diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index cb43a6d8f..ccdfd6688 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -16,44 +16,19 @@ jobs:
  build-basic:
    runs-on: self-hosted
    env:
-      RTE_SDK: /data/dpdk/intel/dpdk-stable-18.11.2
-      RTE_TARGET: x86_64-native-linuxapp-gcc
+      PKG_CONFIG_PATH: /data/dpdk/dpdklib/lib64/pkgconfig
    steps:
    - uses: actions/checkout@v2
    - name: make
-      run: make -j32
+      run: make -j

-  build-mlnx:
-    runs-on: self-hosted
-    env:
-      RTE_SDK: /data/dpdk/mlnx/dpdk-stable-18.11.2
-      RTE_TARGET: x86_64-native-linuxapp-gcc
-    steps:
-    - uses: actions/checkout@v2
-    - name: config
-      run: sed -i 's/^CONFIG_MLX5=./CONFIG_MLX5=y/' src/config.mk
-    - name: make
-      run: make -j32

  build-debug:
    runs-on: self-hosted
    env:
-      RTE_SDK: /data/dpdk/intel/dpdk-stable-18.11.2
-      RTE_TARGET: x86_64-native-linuxapp-gcc
+      PKG_CONFIG_PATH: /data/dpdk/dpdklib/lib64/pkgconfig
    steps:
    - uses: actions/checkout@v2
    - name: config
      run: sed -i 's/#CFLAGS +=/CFLAGS +=/' src/config.mk && sed -i 's/^#DEBUG := 1/DEBUG := 1/' src/Makefile
    - name: make
-      run: make -j32
-
-  build-olddpdk:
-    runs-on: self-hosted
-    env:
-      RTE_SDK: /data/dpdk/intel/dpdk-stable-17.11.6
-      RTE_TARGET: x86_64-native-linuxapp-gcc
-    steps:
-    - uses: actions/checkout@v2
-    - name: make
-      run: make -j32
-
+      run: make -j
diff --git a/.github/workflows/run.yaml b/.github/workflows/run.yaml
index cf3350f1b..41a77bcba 100644
--- a/.github/workflows/run.yaml
+++ b/.github/workflows/run.yaml
@@ -16,12 +16,11 @@ jobs:
  run-dpvs:
    runs-on: self-hosted
    env:
-      RTE_SDK: /data/dpdk/intel/dpdk-stable-18.11.2
-      RTE_TARGET: x86_64-native-linuxapp-gcc
+      PKG_CONFIG_PATH: /data/dpdk/dpdklib/lib64/pkgconfig
    steps:
    - uses: actions/checkout@v2
    - name: make
-      run: make -j32
+      run: make -j
    - name: install
      run: make install
    - name: run-dpvs
diff --git a/README.md b/README.md
index 8db4504eb..b2bc6ed66 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
`DPVS` is a high performance **Layer-4 load balancer** based on [DPDK](http://dpdk.org). It's derived from Linux Virtual Server [LVS](http://www.linuxvirtualserver.org/) and its modification [alibaba/LVS](https://github.com/alibaba/LVS).

-> The name `DPVS` comes from "DPDK-LVS".
+> Notes: The name `DPVS` comes from "DPDK-LVS".

![dpvs.png](./pic/dpvs.png)

@@ -52,7 +52,9 @@
This *quick start* is tested with the environment below. Other environments should also be OK if DPDK works, please check [dpdk.org](http://www.dpdk.org) for more info.

* Please check this link for NICs supported by DPDK: http://dpdk.org/doc/nics.
-* Note `flow-director` ([fdir](http://dpdk.org/doc/guides/nics/overview.html#id1)) is needed for `FNAT` and `SNAT` mode with multi-cores.
+* Note `flow control` ([rte_flow](http://dpdk.org/doc/guides/nics/overview.html#id1)) is needed for `FNAT` and `SNAT` modes with multiple cores.
+
+> Notes: For dpvs to work properly with multiple cores, the NIC's rte_flow support must include the four items "ipv4, ipv6, tcp, udp", and at least the actions "drop, queue".

## Clone DPVS

@@ -65,48 +67,49 @@
Well, let's start from DPDK then.

## DPDK setup.

-Currently, `dpdk-stable-18.11.2` is recommended for `DPVS`. `dpdk-stable-17.11.2` and `dpdk-stable-17.11.6` are supported until the lifecycle end of DPVS v1.8.
+Currently, `dpdk-stable-20.11.1` is recommended for `DPVS`, and dpdk versions earlier than dpdk-20.11 are no longer supported.
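+
+> Tips: Once DPDK is installed and `PKG_CONFIG_PATH` is set as described in the build steps below, a quick sanity check of the DPDK version is possible (a minimal, optional check; it assumes `pkg-config` is available):
+
+```bash
+$ pkg-config --modversion libdpdk    # expect 20.11.1 for this guide
+```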
+If you are still using earlier dpdk versions, such as `dpdk-stable-17.11.2`, `dpdk-stable-17.11.6` and `dpdk-stable-18.11.2`, please use earlier dpvs releases, such as [v1.8.10](https://github.com/iqiyi/dpvs/releases/tag/v1.8.10).

-> You can skip this section if experienced with DPDK, and refer the [link](http://dpdk.org/doc/guides/linux_gsg/index.html) for details.
+> Notes: You can skip this section if you are experienced with DPDK, and refer to the [link](http://dpdk.org/doc/guides/linux_gsg/index.html) for details.

```bash
-$ wget https://fast.dpdk.org/rel/dpdk-18.11.2.tar.xz   # download from dpdk.org if link failed.
-$ tar xf dpdk-18.11.2.tar.xz
+$ wget https://fast.dpdk.org/rel/dpdk-20.11.1.tar.xz   # download from dpdk.org if link failed.
+$ tar xf dpdk-20.11.1.tar.xz
```

### DPDK patchs

There are some patches for DPDK to support extra features needed by DPVS. Apply them if needed. For example, there's a patch for DPDK `kni` driver for hardware multicast, apply it if you are to launch `ospfd` on `kni` device.

-> Assuming we are in DPVS root directory and dpdk-stable-18.11.2 is under it, please note it's not mandatory, just for convenience.
+> Notes: Assuming we are in the DPVS root directory and dpdk-stable-20.11.1 is under it; please note it's not mandatory, just for convenience.

```
$ cd
-$ cp patch/dpdk-stable-18.11.2/*.patch dpdk-stable-18.11.2/
-$ cd dpdk-stable-18.11.2/
+$ cp patch/dpdk-stable-20.11.1/*.patch dpdk-stable-20.11.1/
+$ cd dpdk-stable-20.11.1/
$ patch -p1 < 0001-kni-use-netlink-event-for-multicast-driver-part.patch
-$ patch -p1 < 0002-net-support-variable-IP-header-len-for-checksum-API.patch
+$ patch -p1 < 0002-pdump-change-dpdk-pdump-tool-for-dpvs.patch
$ ...
```

-> It's advised to patch all if your are not sure about what they are meant for.
+> Tips: It's advised to apply all the patches if you are not sure what they are meant for.

### DPDK build and install

-Now build DPDK and export `RTE_SDK` env variable for DPDK app (DPVS).
+Use meson-ninja to build the DPDK libraries, and export the environment variable `PKG_CONFIG_PATH` for the DPDK app (DPVS). The `dpdk.mk` in DPVS checks the presence of libdpdk.

```bash
-$ cd dpdk-stable-18.11.2/
-$ make config T=x86_64-native-linuxapp-gcc
-Configuration done
-$ make # or make -j40 to save time, where 40 is the cpu core number.
-$ export RTE_SDK=$PWD
-$ export RTE_TARGET=build
+$ cd dpdk-stable-20.11.1
+$ mkdir dpdklib      # user desired install folder
+$ mkdir dpdkbuild    # user desired build folder
+$ meson -Denable_kmods=true -Dprefix=$(pwd)/dpdklib dpdkbuild    # meson requires an absolute install prefix
+$ ninja -C dpdkbuild
+$ cd dpdkbuild; ninja install
+$ export PKG_CONFIG_PATH=$(pwd)/../dpdklib/lib64/pkgconfig       # the directory containing libdpdk.pc
```

-In our tutorial, `RTE_TARGET` is set to the default "build", thus DPDK libs and header files can be found in `dpdk-stable-18.11.2/build`.
+> Tips: You can use the script [dpdk-build.sh](./scripts/dpdk-build.sh) to facilitate the dpdk build. Run `dpdk-build.sh -h` for the usage of the script.

-Now to set up DPDK hugepage, our test environment is NUMA system. For single-node system please refer to the [link](http://dpdk.org/doc/guides/linux_gsg/sys_reqs.html).
+Next is to set up DPDK hugepages. Our test environment is a NUMA system. For a single-node system please refer to the [link](http://dpdk.org/doc/guides/linux_gsg/sys_reqs.html).

```bash
$ # for NUMA machine
@@ -117,40 +120,41 @@
$ mkdir /mnt/huge
$ mount -t hugetlbfs nodev /mnt/huge
```

-Install kernel modules and bind NIC with `igb_uio` driver. Quick start uses only one NIC, normally we use 2 for FNAT cluster, even 4 for bonding mode. For example, suppose the NIC we would use to run DPVS is eth0, in the meantime, we still keep another standalone NIC eth1 for debugging.
+Install kernel modules and bind the NIC with the `uio_pci_generic` driver. Quick start uses only one NIC; normally we use two for a FNAT cluster, even four for bonding mode. For example, suppose the NIC we would use to run DPVS is eth0; in the meantime, we still keep another standalone NIC eth1 for debugging.

```bash
-$ modprobe uio
-$ cd dpdk-stable-18.11.2
+$ modprobe uio_pci_generic

-$ insmod build/kmod/igb_uio.ko
-$ insmod build/kmod/rte_kni.ko carrier=on
+$ cd dpdk-stable-20.11.1
+$ insmod dpdkbuild/kernel/linux/kni/rte_kni.ko carrier=on

$ ./usertools/dpdk-devbind.py --status
-$ ifconfig eth0 down    # assuming eth0 is 0000:06:00.0
-$ ./usertools/dpdk-devbind.py -b igb_uio 0000:06:00.0
+$ ifconfig eth0 down    # assuming eth0 is 0000:06:00.0
+$ ./usertools/dpdk-devbind.py -b uio_pci_generic 0000:06:00.0
```

-> Note that a kernel parameter `carrier` is added to `rte_kni.ko` since [DPDK v18.11](https://elixir.bootlin.com/dpdk/v18.11/source/kernel/linux/kni/kni_misc.c), and the default value for it is "off". We need to load `rte_kni.ko` with the extra parameter `carrier=on` to make KNI devices work properly.
+> Notes:
+> 1. An alternative to `uio_pci_generic` is `igb_uio`, which has been moved to a separate repository, [dpdk-kmods](http://git.dpdk.org/dpdk-kmods).
+> 2. A kernel module parameter `carrier` was added to `rte_kni.ko` in [DPDK v18.11](https://elixir.bootlin.com/dpdk/v18.11/source/kernel/linux/kni/kni_misc.c), and its default value is "off". We need to load `rte_kni.ko` with the extra parameter `carrier=on` to make KNI devices work properly.

`dpdk-devbind.py -u` can be used to unbind driver and switch it back to Linux driver like `ixgbe`. You can also use `lspci` or `ethtool -i eth0` to check the NIC PCI bus-id. Please refer to [DPDK site](http://www.dpdk.org) for more details.

-> Note: PMD of Mellanox NIC is built on top of libibverbs using the Raw Ethernet Accelerated Verbs AP. It doesn't rely on UIO/VFIO driver. Thus, Mellanox NICs should not bind the `igb_uio` driver. Refer to [Mellanox DPDK](https://community.mellanox.com/s/article/mellanox-dpdk) for details.
+> Notes: The PMD of Mellanox NICs is built on top of libibverbs using the Raw Ethernet Accelerated Verbs API. It doesn't rely on a UIO/VFIO driver. Thus, Mellanox NICs should not be bound to the `igb_uio` driver. Refer to [Mellanox DPDK](https://community.mellanox.com/s/article/mellanox-dpdk) for details.

## Build DPVS

-It's simple, just set `RTE_SDK` and build it.
+It's simple, just set `PKG_CONFIG_PATH` and build it.

```bash
-$ cd dpdk-stable-18.11.2/
-$ export RTE_SDK=$PWD
+$ export PKG_CONFIG_PATH=    # the directory containing libdpdk.pc, normally dpdklib/lib64/pkgconfig
$ cd
-$ make # or "make -j40" to speed up.
+$ make # or "make -j" to speed up
$ make install
```
-
-> Build dependencies may be needed, such as `automake`, `libnl3`, `libnl-genl-3.0`, `openssl`, `popt` and `numactl`. You can install the missing dependencies by using the package manager of the system, e.g., `yum install popt-devel` (CentOS).
+> Notes:
+> 1. Build dependencies may be needed, such as `pkg-config` (version 0.29.2+), `automake`, `libnl3`, `libnl-genl-3.0`, `openssl`, `popt` and `numactl`. You can install the missing dependencies by using the package manager of the system, e.g., `yum install popt-devel` (CentOS).
+> 2.
Early `pkg-config` versions (v0.29.2 before) may cause dpvs build failure. If so, please upgrade this tool. Output files are installed to `dpvs/bin`. @@ -196,7 +200,7 @@ EAL: Error - exiting with code: 1 ``` >It means the NIC count of DPVS does not match `/etc/dpvs.conf`. Please use `dpdk-devbind` to adjust the NIC number or modify `dpvs.conf`. We'll improve this part to make DPVS more "clever" to avoid modify config file when NIC count does not match. -What config items does `dpvs.conf` support and how to configure them? Well, `DPVS` maintains a config item file `conf/dpvs.conf.items` which lists all supported config entries and corresponding feasible values. +What config items does `dpvs.conf` support? How to configure them? Well, `DPVS` maintains a config item file `conf/dpvs.conf.items` which lists all supported config entries and corresponding feasible values. Besides, some config sample files maintained as `./conf/dpvs.*.sample` show the configurations of dpvs in some specified cases. ## Test Full-NAT (FNAT) Load Balancer diff --git a/conf/dpvs.bond.conf.sample b/conf/dpvs.bond.conf.sample index f6e554c8d..aaea16e98 100644 --- a/conf/dpvs.bond.conf.sample +++ b/conf/dpvs.bond.conf.sample @@ -22,6 +22,7 @@ global_defs { netif_defs { pktpool_size 1048575 pktpool_cache 256 + fdir_mode perfect device dpdk0 { rx { @@ -33,11 +34,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode ! kni_name dpdk0.kni @@ -53,11 +49,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode ! kni_name dpdk1.kni @@ -74,11 +65,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode ! kni_name dpdk2.kni @@ -94,11 +80,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode ! kni_name dpdk3.kni @@ -109,6 +90,7 @@ netif_defs { slave dpdk0 slave dpdk1 primary dpdk0 + ! numa_node 1 ! /sys/bus/pci/devices/[slaves' pci]/numa_node kni_name bond0.kni } @@ -117,6 +99,7 @@ netif_defs { slave dpdk2 slave dpdk3 primary dpdk2 + ! numa_node 1 ! /sys/bus/pci/devices/[slaves' pci]/numa_node kni_name bond1.kni } } @@ -250,7 +233,7 @@ worker_defs { worker cpu8 { type slave cpu_id 8 - icmp_redirect_core + ! icmp_redirect_core port bond0 { rx_queue_ids 7 tx_queue_ids 7 @@ -386,5 +369,6 @@ ipvs_defs { ! sa_pool config sa_pool { - pool_hash_size 16 + pool_hash_size 16 + flow_enable on } diff --git a/conf/dpvs.conf.items b/conf/dpvs.conf.items index df579f7ff..e23c0316a 100644 --- a/conf/dpvs.conf.items +++ b/conf/dpvs.conf.items @@ -22,6 +22,7 @@ global_defs { netif_defs { pktpool_size 2097151 <65535, 1023-134217728> pktpool_cache 256 <256, 32-8192> + fdir_mode perfect # only for ixgbe device dpdk0 { rx { @@ -34,12 +35,6 @@ netif_defs { queue_number 6 <16, 0-16> descriptor_number 512 <512, 16-8192> } - fdir { - filter on - mode perfect - pballoc 64k <64k, 64k|128k|256k> - status matched - } ! mtu 1500 <1500,0-9000> ! promisc_mode ! kni_name dpdk0.kni @@ -61,12 +56,17 @@ netif_defs { ! kni_name dpdk1.kni } - device bond0 { + bonding bond0 { mode 4 <0-6> slave dpdk0 slave dpdk1 primary dpdk0 + numa_node 0 <0, int value from /sys/bus/pci/devices/[pci_bus]/numa_node> kni_name bond0.kni + + ! supported options: + ! 
dedicated_queues=on|enable|off|disable, default on + options OPT1=VAL1;OPT2=VAL2;... } } @@ -262,4 +262,5 @@ ipvs_defs { sa_pool { pool_hash_size 16 <16, 1-128> + flow_enable on } diff --git a/conf/dpvs.conf.sample b/conf/dpvs.conf.sample index f9baf3f7a..0585b80c4 100644 --- a/conf/dpvs.conf.sample +++ b/conf/dpvs.conf.sample @@ -22,6 +22,7 @@ global_defs { netif_defs { pktpool_size 1048575 pktpool_cache 256 + fdir_mode perfect device dpdk0 { rx { @@ -33,11 +34,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode kni_name dpdk0.kni @@ -53,11 +49,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode kni_name dpdk1.kni @@ -69,6 +60,7 @@ netif_defs { ! slave dpdk1 ! primary dpdk0 ! kni_name bond0.kni + ! options dedicated_queues=off # for mode 4 only !} } @@ -201,7 +193,7 @@ worker_defs { worker cpu8 { type slave cpu_id 8 - icmp_redirect_core + ! icmp_redirect_core port dpdk0 { rx_queue_ids 7 tx_queue_ids 7 @@ -337,5 +329,6 @@ ipvs_defs { ! sa_pool config sa_pool { - pool_hash_size 16 + pool_hash_size 16 + flow_enable on } diff --git a/conf/dpvs.conf.single-bond.sample b/conf/dpvs.conf.single-bond.sample index aec33d3a8..ce6a16562 100644 --- a/conf/dpvs.conf.single-bond.sample +++ b/conf/dpvs.conf.single-bond.sample @@ -32,11 +32,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode ! kni_name dpdk0.kni @@ -52,22 +47,19 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode ! kni_name dpdk2.kni } bonding bond0 { - mode 0 + mode 4 slave dpdk0 slave dpdk2 primary dpdk0 + ! numa_node 1 ! /sys/bus/pci/devices/[slaves' pci]/numa_node kni_name bond0.kni + options dedicated_queues=off } } @@ -158,7 +150,7 @@ worker_defs { worker cpu8 { type slave cpu_id 8 - icmp_redirect_core + ! icmp_redirect_core port bond0 { rx_queue_ids 7 tx_queue_ids 7 @@ -285,5 +277,6 @@ ipvs_defs { ! sa_pool config sa_pool { - pool_hash_size 16 + pool_hash_size 16 + flow_enable on } diff --git a/conf/dpvs.conf.single-nic.sample b/conf/dpvs.conf.single-nic.sample index 40a34dd99..faa58b9e4 100644 --- a/conf/dpvs.conf.single-nic.sample +++ b/conf/dpvs.conf.single-nic.sample @@ -32,11 +32,6 @@ netif_defs { queue_number 8 descriptor_number 1024 } - fdir { - mode perfect - pballoc 64k - status matched - } ! mtu 1500 ! promisc_mode kni_name dpdk0.kni @@ -130,7 +125,7 @@ worker_defs { worker cpu8 { type slave cpu_id 8 - icmp_redirect_core + ! icmp_redirect_core port dpdk0 { rx_queue_ids 7 tx_queue_ids 7 @@ -257,5 +252,6 @@ ipvs_defs { ! sa_pool config sa_pool { - pool_hash_size 16 + pool_hash_size 16 + flow_enable on } diff --git a/doc/faq.md b/doc/faq.md index 86f601aeb..e61ae86e9 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -27,12 +27,12 @@ DPVS Frequently Asked Questions (FAQ) Please try to follow `README.md` and `doc/tutorial.md` first. And if you still have problem, possible reasons are: -1. NIC do not support DPDK or *flow-director* (`fdir`), please check this [answer](#nic). -2. DPDK not compatible with Kernel Version, it cause build error, please refer to [DPDK.org](https://www.dpdk.org/) or consider upgrade the Kernel. +1. NIC does not support DPDK or *flow control* (`rte_flow`), please check this [answer](#nic). +2. 
DPDK is not compatible with the kernel version, which causes build errors; please refer to [DPDK.org](https://www.dpdk.org/) or consider upgrading the kernel.
3. CPU core (`lcore`) and NIC queue's configure is miss-match. Please read `conf/*.sample`, note worker-CPU/NIC-queue are 1:1 mapping and you need one more cpu for master.
4. DPDK NIC's link is not up ? please check NIC cable first.
-5. `curl` VIP in FullNAT mode fails (or sometime fails)? Please check if NIC support [fdir](#nic).
+5. `curl` VIP in FullNAT mode fails (or sometimes fails)? Please check if the NIC supports [rte_flow](#nic).
6. `curl` still fails. Please check route and arp by `dpip route show`, `dpip neigh show`.
6. The patchs in `patch/` are not applied.
@@ -42,16 +42,28 @@
And you may find other similar issues and solutions from Github's issues list.

### Does my NIC support DPVS ?

-Actaully, it's the question about if the NIC support DPDK as well as "flow-director (fdir)".
+Actually, the question is whether the NIC supports DPDK as well as "flow control" (`rte_flow`).

-First, please make sure the NIC support `DPDK`, you can check the [link](https://core.dpdk.org/supported/). Second, DPVS's FullNAT/SNAT mode need flow-director feature, *unless you configure only one worker*. For `fdir` support, this [link](http://doc.dpdk.org/guides/nics/overview.html#id1) can be checked.
+First, please make sure the NIC supports `DPDK`; you can check the [link](https://core.dpdk.org/supported/). Second, DPVS's FullNAT/SNAT modes need the flow control (`rte_flow`) feature, *unless you configure only one worker*. For `rte_flow` support, this [link](http://doc.dpdk.org/guides/nics/overview.html#id1) can be checked.

-Please find the DPDK driver name according to your NIC by the first link. And check `fdir` support for each drivers from the matrix in the second link.
+Please find the DPDK driver name according to your NIC by the first link, and check `rte_flow` support for each driver from the matrix in the second link.

1. https://core.dpdk.org/supported/
2. http://doc.dpdk.org/guides/nics/overview.html#id1

-> `Fdir` is replaced with `rte_flow` in the lastest DPDK. DPVS is making efforts to adapt to the change.
+The PMD of your NIC should support the following rte_flow items,
+
+* ipv4
+* ipv6
+* tcp
+* udp
+
+and at least the following rte_flow actions.
+
+* queue
+* drop
+
+> If you are using only one worker, you can turn off dpvs flow control by setting `sa_pool/flow_enable` to `off` in dpvs.conf.
@@ -106,9 +118,9 @@ Yes, it does support UDP. In order to get the real client IP/port in FullNAT mod

### Does DPVS support IP fragment ?

-No, since connection table is per-lcore (per-CPU), and RSS/fdir are used for FNAT. Assuming RSS mode is TCP and fdir uses L4 info ``. Considered that IP fragment doesn't have L4 info, it needs reassembling first and re-schedule the pkt to **correct** lcore which the 5-tuple flow (connection) belongs to.
+No, since the connection table is per-lcore (per-CPU), and RSS/rte_flow are used for FNAT. Assuming the RSS mode is TCP and rte_flow uses L4 info ``. Considering that an IP fragment doesn't have L4 info, it needs reassembling first and re-scheduling the packet to the **correct** lcore which the 5-tuple flow (connection) belongs to.

-May be someday in the future, we will support "pkt re-schedule" on lcores or use L3 (IP) info only for `RSS`/`FDIR`, then we may support fragment. But even we support fragment, it may hurt the performance (reassemble, re-schedule effort) or security.
+Maybe someday in the future we will support "packet re-schedule" on lcores, or use only L3 (IP) info for `RSS` or `flow control`; then we may support fragments. But even if we support fragments, it may hurt performance (reassembling and re-scheduling effort) or security.

Actually, IPv4 fragment is not recommended, while IPv6 even not support fragment by fixed header, and do not allow re-fragment on middle-boxes. The applications, especially for the datagram-oriented apps, like UDP-apps, should perform PMTU discover algorithm to avoid fragment. TCP is sending sliced *segments*, notifying MSS to peer side and *PMTU discover* is built-in, TCP-app should not need worry about fragment.

@@ -116,7 +128,7 @@

### How to launch DPVS on Virtual Machine ?

-Please refer to the [tutorial.md](../doc/tutorial.md), there's an exmaple to run DPVS on `Ubuntu`. Basically, you may need to reduce memory usage. And for VM's NIC, `fdir` is not supported, so if you want to config FullNAT/SNAT mode, you have to configure **only one** worker (cpu), and another CPU core for master.
+Please refer to the [tutorial.md](../doc/tutorial.md); there's an example to run DPVS on `Ubuntu`. Basically, you may need to reduce memory usage. And for a VM's NIC, `rte_flow` is not supported, so if you want to config FullNAT/SNAT mode, you have to configure **only one** worker (cpu), and another CPU core for master.
diff --git a/doc/tutorial.md b/doc/tutorial.md
index 73ec78b68..bd69d37a3 100644
--- a/doc/tutorial.md
+++ b/doc/tutorial.md
@@ -431,9 +431,20 @@ virtual_server group 192.168.100.254-80 {
}
```

-The keepalived config for backup is the same with Master, except the `state` should be 'BACKUP', and `priority` should be lower.
+The keepalived config for backup is the same as the Master's, except that
+
+* the local addresses are not the same as the MASTER's,
+* vrrp_instance `state` should be 'BACKUP',
+* vrrp_instance `priority` should be lower.

```
+local_address_group laddr_g1 {
+    192.168.100.202 dpdk0    # use DPDK interface
+    192.168.100.203 dpdk0    # use DPDK interface
+}
+
+... ...
+
vrrp_instance VI_1 {
    state BACKUP
    priority 80
@@ -447,12 +458,19 @@
Start `keepalived` on both Master and Backup.

```bash
./keepalived -f /etc/keepalived/keepalived.conf
```

-For **test only**, add `VIP` and *routes* to DPDK interface manually on Master. Do not set VIP on both master and backup, in practice they should be added to keepalived configure file.
+Then, add *routes* to the DPDK interface manually on both MASTER and BACKUP.

```bash
-./dpip addr add 192.168.100.254/32 dev dpdk0
./dpip route add 192.168.100.0/24 dev dpdk0
```

+Lastly, configure dpdk0.kni to make keepalived's vrrp and health-check work properly.
+
+```bash
+ip link set dpdk0.kni up
+ip addr add 192.168.100.28/24 dev dpdk0.kni                  # assign an IP to dpdk0.kni
+dpip route add 192.168.100.28/32 scope kni_host dev dpdk0    # route packets targeting 192.168.100.28 to dpdk0.kni
+```
+
+Note the dpdk0.kni's IP addresses should be different for MASTER and BACKUP.
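+
+For instance, a minimal sketch of the corresponding BACKUP-side setup (the address 192.168.100.29 here is only an assumed example; any free IP in the subnet that differs from the MASTER's works):
+
+```bash
+ip link set dpdk0.kni up
+ip addr add 192.168.100.29/24 dev dpdk0.kni                  # an IP different from the MASTER's dpdk0.kni
+dpip route add 192.168.100.29/32 scope kni_host dev dpdk0    # route packets targeting 192.168.100.29 to dpdk0.kni
+```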
Check if parameters just set are correct:
@@ -465,7 +483,7 @@ TCP 192.168.100.254:80 rr
  -> 192.168.100.2:80             FullNat 100    0          0
  -> 192.168.100.3:80             FullNat 100    0          0

-$ ./dpip addr show
+$ ./dpip addr show -s
inet 192.168.100.254/32 scope global dpdk0
     valid_lft forever preferred_lft forever
inet 192.168.100.201/32 scope global dpdk0
     valid_lft forever preferred_lft forever
inet 192.168.100.200/32 scope global dpdk0
     valid_lft forever preferred_lft forever
     sa_used 0 sa_free 1032176 sa_miss 0

$ ./dpip route show
+inet 192.168.100.28/32 via 0.0.0.0 src 0.0.0.0 dev dpdk0 mtu 1500 tos 0 scope kni_host metric 0 proto auto
inet 192.168.100.200/32 via 0.0.0.0 src 0.0.0.0 dev dpdk0 mtu 1500 tos 0 scope host metric 0 proto auto
inet 192.168.100.201/32 via 0.0.0.0 src 0.0.0.0 dev dpdk0 mtu 1500 tos 0 scope host metric 0 proto auto
+inet 192.168.100.254/32 via 0.0.0.0 src 0.0.0.0 dev dpdk0 mtu 1500 tos 0 scope host metric 0 proto auto
inet 192.168.100.0/24 via 0.0.0.0 src 0.0.0.0 dev dpdk0 mtu 1500 tos 0 scope link metric 0 proto auto

$ ./ipvsadm -G
@@ -492,7 +512,20 @@
client$ curl 192.168.100.254
Your ip:port : 192.168.100.146:42394
```

-> We just explain how DPVS works with keepalived, and not verify if the master/backup feature provided by keepalived works. Please refer LVS docs if needed.
+> Note:
+> 1. We just explain how DPVS works with keepalived here, without verifying whether the master/backup feature provided by keepalived works. Please refer to the LVS docs if needed.
+> 2. Keepalived master/backup failover may fail if the switch has ARP broadcast suppression enabled (unfortunately, that is often the case). If you don't want to change your switch's configuration, decreasing the number of gratuitous ARP packets sent by keepalived (dpvs) on failover may help.
+
+```
+global_defs {
+    ... ...
+    vrrp_garp_master_repeat 1            # repeat counts for master state gratuitous arp
+    vrrp_garp_master_delay 1             # time to relaunch gratuitous arp after failover for master, in seconds
+    vrrp_garp_master_refresh 600         # time interval to refresh gratuitous arp periodically (0 = none), in seconds
+    vrrp_garp_master_refresh_repeat 1    # repeat counts to refresh gratuitous arp periodically
+    ... ...
+}
+```
@@ -606,9 +639,9 @@ A strict limitation exists for DPVS NAT mode: **DPVS `NAT` mode can only work in

* DPVS session entries are splited and distributed on lcores by RSS.
* NAT forwarding requires both inbound and outbound traffic go through DPVS.
* Only dest IP/port is translated in NAT forwarding, source IP/port is not changed.
-* Very limited maximum flow director rules can be set for a NIC.
+* Very limited maximum rte_flow rules can be set for a NIC.

-So, if no other control of the traffic flow, outbound packets may arrive at different lcore from inbound packets. If so, outbound packets would be dropped because session lookup miss. Full-NAT fixes the problem by using Flow Director(FDIR). However, there are very limited rules can be added for a NIC, i.e. 8K for XT-540. Unlike Full-NAT, NAT does not have local IP/port, so FDIR rules can only be set on source IP/port, which means only thousands concurrency is supported. Therefore, FDIR is not feasible for NAT.
+So, if there is no other control of the traffic flow, outbound packets may arrive at a different lcore from the inbound packets. If so, the outbound packets would be dropped because of session lookup misses. Full-NAT fixes the problem by using Flow Control (rte_flow). However, only a very limited number of rules can be added to a NIC, i.e. 8K for the XT-540.
+Unlike Full-NAT, NAT does not have local IP/port, so flow rules can only be set on source IP/port, which means only thousands of concurrent connections are supported. Therefore, rte_flow is not feasible for NAT.

Whatever, we give a simple example for NAT mode. Remind it only works single lcore.
@@ -961,31 +994,28 @@ DPVS supports IPv6-IPv4 for fullnat, which means VIP/client IP can be IPv6 and l
```

OSPF can just be configured like IPv6-IPv6. If you prefer keepalived, you can configure it like IPv6-IPv6 except real_server/local_address_group.

-**IPv6 and Flow Director**
+**IPv6 and Flow Control**

-We found there exists some NICs do not (fully) support Flow Director for IPv6.
-For example, 82599 10GE Controller do not support IPv6 *perfect mode*, and IPv4/IPv6 *signature mode* supports only one locall IP.
-
-If you would like to use Flow Director signature mode, add the following lines into the device configs of `dpvs.conf`:
+We found some NICs do not (fully) support the Flow Control (rte_flow) features required for IPv6.
+For example, the rte_flow of the 82599 10GE Controller (ixgbe PMD) relies on an old-fashioned flow type, `flow director` (fdir), which doesn't support IPv6 in its *perfect mode* and supports only one local IPv4 or IPv6 address in its *signature mode*. DPVS supports the fdir mode config for compatibility.

```
-fdir {
+netif_defs {
+    ...
    mode signature
-    pballoc 64k
-    status matched
}
```

-Another method to avoid Flow Director problem is to use the redirect forwarding, which forwards the recieved packets to the right lcore where the session resides by using lockless DPDK rings.
+Another method to work around the not (fully) supported rte_flow problem is redirect forwarding, which forwards received packets to the correct worker lcore where the session resides by using lockless DPDK rings.

If you want to try this method, turn on the `redirect` switch in the `dpvs.conf`.

```
ipvs_defs {
    conn {
-        ......
+        ...
        redirect on
    }
-    ......
+    ...
}
```

It should note that the redirect forwarding may harm performance to a certain degree. Keep it in `off` state unless you have no other solutions.
@@ -1057,7 +1087,7 @@ Please also check `dpip tunnel help` for details.

> Notes:
> 1. RSS schedule all packets to same queue/CPU since underlay source IP may the same.
>    If one lcore's `sa_pool` gets full, `sa_miss` happens. This is not a problem for some NICs which support inner RSS for tunnelling.
-> 2. `fdir`/`rss` won't works well on tunnel deivce, do not use tunnel for FNAT.
+> 2. `rte_flow`/`rss` won't work well on tunnel devices, do not use tunnel for FNAT.
@@ -1128,7 +1158,7 @@ Now, `dpvs.conf` must be put at `/etc/dpvs.conf`, just copy it from `conf/dpvs.c

```bash
$ cp conf/dpvs.conf.single-nic.sample /etc/dpvs.conf
```

-The NIC for Ubuntu may not support flow-director(fdir),for that case ,please use 'single worker',may decrease conn_pool_size .
+The NIC for Ubuntu may not support the flow control (rte_flow) required by DPVS. For that case, please use a 'single worker' and disable flow control.
```bash queue_number 1 @@ -1150,6 +1180,9 @@ worker_defs { } } + sa_pool { + flow_enable off + } ``` diff --git a/include/conf/common.h b/include/conf/common.h index 00648ed67..7472ad8f1 100644 --- a/include/conf/common.h +++ b/include/conf/common.h @@ -138,6 +138,7 @@ extern const char *dpvs_strerror(int err); int get_numa_nodes(void); +int linux_get_link_status(const char *ifname, int *if_flags, char *if_flags_str, size_t len); int linux_set_if_mac(const char *ifname, const unsigned char mac[ETH_ALEN]); int linux_hw_mc_add(const char *ifname, const uint8_t hwma[ETH_ALEN]); int linux_hw_mc_del(const char *ifname, const uint8_t hwma[ETH_ALEN]); diff --git a/include/conf/eal_mem.h b/include/conf/eal_mem.h index 6506ca9a3..bab2410f5 100644 --- a/include/conf/eal_mem.h +++ b/include/conf/eal_mem.h @@ -35,7 +35,7 @@ enum { }; typedef struct eal_mem_seg_ret_s { - uint64_t phys_addr; + uint64_t iova; uint64_t virt_addr; uint64_t len; uint64_t hugepage_sz; @@ -52,7 +52,7 @@ typedef struct eal_all_mem_seg_ret_s { typedef struct eal_mem_zone_ret_s { char name[EAL_MEM_NAME_LEN]; - uint64_t phys_addr; + uint64_t iova; uint64_t virt_addr; uint64_t len; uint64_t hugepage_sz; diff --git a/include/conf/neigh.h b/include/conf/neigh.h index d4881030d..618cad2a7 100644 --- a/include/conf/neigh.h +++ b/include/conf/neigh.h @@ -33,14 +33,18 @@ enum { }; struct dp_vs_neigh_conf { - int af; - uint8_t flag; - uint32_t state; - union inet_addr ip_addr; - struct ether_addr eth_addr; - uint32_t que_num; - char ifname[IFNAMSIZ]; - uint8_t cid; + int af; + uint32_t state; + union inet_addr ip_addr; +#ifdef __DPVS__ + struct rte_ether_addr eth_addr; +#else + struct ether_addr eth_addr; +#endif + uint32_t que_num; + char ifname[IFNAMSIZ]; + uint8_t flag; + uint8_t cid; }__attribute__((__packed__)); struct dp_vs_neigh_conf_array { diff --git a/include/dpdk.h b/include/dpdk.h index 81d6465b3..2fdcd418d 100644 --- a/include/dpdk.h +++ b/include/dpdk.h @@ -57,6 +57,7 @@ #include #include #include +#include #include "mbuf.h" #ifdef CONFIG_DPVS_PDUMP #include diff --git a/include/ipv4.h b/include/ipv4.h index ce3fb3b63..cf95882f2 100644 --- a/include/ipv4.h +++ b/include/ipv4.h @@ -45,8 +45,8 @@ int ipv4_output(struct rte_mbuf *mbuf); * Transport Protocols */ struct inet_protocol { - /* mbuf->userdata can be used to get IPv4 header, - * save it if protocols need ->userdata for other purpose. */ + /* mbuf userdata (MBUF_FIELD_PROTO) can be used to get IPv4 header, + * save it if protocols need mbuf userdata (MBUF_FIELD_PROTO) for other purpose. 
*/ int (*handler)(struct rte_mbuf *mbuf); }; @@ -117,15 +117,15 @@ struct ip4_stats; int ipv4_get_stats(struct ip4_stats *stats); int ip4_defrag(struct rte_mbuf *mbuf, int user); -uint32_t ip4_select_id(struct ipv4_hdr *iph); +uint32_t ip4_select_id(struct rte_ipv4_hdr *iph); int ipv4_local_out(struct rte_mbuf *mbuf); int ipv4_rcv_fin(struct rte_mbuf *mbuf); /* helper functions */ -static inline struct ipv4_hdr *ip4_hdr(const struct rte_mbuf *mbuf) +static inline struct rte_ipv4_hdr *ip4_hdr(const struct rte_mbuf *mbuf) { /* can only invoked at L3 */ - return rte_pktmbuf_mtod(mbuf, struct ipv4_hdr *); + return rte_pktmbuf_mtod(mbuf, struct rte_ipv4_hdr *); } static inline int ip4_hdrlen(const struct rte_mbuf *mbuf) @@ -133,16 +133,16 @@ static inline int ip4_hdrlen(const struct rte_mbuf *mbuf) return (ip4_hdr(mbuf)->version_ihl & 0xf) << 2; } -static inline void ip4_send_csum(struct ipv4_hdr *iph) +static inline void ip4_send_csum(struct rte_ipv4_hdr *iph) { iph->hdr_checksum = 0; iph->hdr_checksum = rte_ipv4_cksum(iph); } -static inline bool ip4_is_frag(struct ipv4_hdr *iph) +static inline bool ip4_is_frag(struct rte_ipv4_hdr *iph) { return (iph->fragment_offset - & htons(IPV4_HDR_MF_FLAG | IPV4_HDR_OFFSET_MASK)) != 0; + & htons(RTE_IPV4_HDR_MF_FLAG | RTE_IPV4_HDR_OFFSET_MASK)) != 0; } #endif /* __DPVS_IPV4_H__ */ diff --git a/include/ipvs/conn.h b/include/ipvs/conn.h index c49f535f2..b80acabef 100644 --- a/include/ipvs/conn.h +++ b/include/ipvs/conn.h @@ -81,7 +81,6 @@ struct dp_vs_conn_stats { rte_atomic64_t outbytes; } __rte_cache_aligned; -struct dp_vs_fdir_filt; struct dp_vs_proto; struct dp_vs_conn { @@ -121,10 +120,10 @@ struct dp_vs_conn { struct rte_mbuf *mbuf); /* L2 fast xmit */ - struct ether_addr in_smac; - struct ether_addr in_dmac; - struct ether_addr out_smac; - struct ether_addr out_dmac; + struct rte_ether_addr in_smac; + struct rte_ether_addr in_dmac; + struct rte_ether_addr out_smac; + struct rte_ether_addr out_dmac; /* route for neigbour */ struct netif_port *in_dev; /* inside to rs*/ diff --git a/include/ipvs/kcompat.h b/include/ipvs/kcompat.h index 2203b9312..2ffd760c9 100644 --- a/include/ipvs/kcompat.h +++ b/include/ipvs/kcompat.h @@ -53,7 +53,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -inline unsigned long __ffs(unsigned long word); +unsigned long __ffs(unsigned long word); /** * fls - find last (most-significant) bit set @@ -62,7 +62,7 @@ inline unsigned long __ffs(unsigned long word); * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 
*/ -inline int fls(unsigned int x); +int fls(unsigned int x); /** * taken from definition in include/linux/gcd.h diff --git a/include/ipvs/nat64.h b/include/ipvs/nat64.h index 4f397fe63..eb1017171 100644 --- a/include/ipvs/nat64.h +++ b/include/ipvs/nat64.h @@ -28,7 +28,7 @@ static inline int mbuf_nat6to4_len(struct rte_mbuf *mbuf) int len; offset = ip6_skip_exthdr(mbuf, offset, &nexthdr); - len = mbuf->pkt_len - offset + sizeof(struct ipv4_hdr); + len = mbuf->pkt_len - offset + sizeof(struct rte_ipv4_hdr); return len; } diff --git a/include/ipvs/proto_tcp.h b/include/ipvs/proto_tcp.h index 21ee6ef48..9f5162a85 100644 --- a/include/ipvs/proto_tcp.h +++ b/include/ipvs/proto_tcp.h @@ -100,8 +100,8 @@ struct tcp_state { #define sSA DPVS_TCP_S_SYNACK struct tcphdr *tcp_hdr(const struct rte_mbuf *mbuf); -void tcp4_send_csum(struct ipv4_hdr *iph, struct tcphdr *th); -void tcp6_send_csum(struct ipv6_hdr *iph, struct tcphdr *th); +void tcp4_send_csum(struct rte_ipv4_hdr *iph, struct tcphdr *th); +void tcp6_send_csum(struct rte_ipv6_hdr *iph, struct tcphdr *th); struct rte_mempool *get_mbuf_pool(const struct dp_vs_conn *conn, int dir); void install_proto_tcp_keywords(void); void tcp_keyword_value_init(void); diff --git a/include/ipvs/proto_udp.h b/include/ipvs/proto_udp.h index 3e379352e..66881f0ce 100644 --- a/include/ipvs/proto_udp.h +++ b/include/ipvs/proto_udp.h @@ -30,7 +30,7 @@ extern int g_defence_udp_drop; void install_proto_udp_keywords(void); void udp_keyword_value_init(void); -void udp4_send_csum(struct ipv4_hdr *iph, struct udp_hdr *uh); -void udp6_send_csum(struct ipv6_hdr *iph, struct udp_hdr *uh); +void udp4_send_csum(struct rte_ipv4_hdr *iph, struct rte_udp_hdr *uh); +void udp6_send_csum(struct rte_ipv6_hdr *iph, struct rte_udp_hdr *uh); #endif diff --git a/include/ipvs/sched.h b/include/ipvs/sched.h index 72e691b62..e26e24c3f 100644 --- a/include/ipvs/sched.h +++ b/include/ipvs/sched.h @@ -27,7 +27,6 @@ struct dp_vs_iphdr; struct dp_vs_scheduler { struct list_head n_list; char *name; -// rte_atomic32_t refcnt; struct dp_vs_dest * (*schedule)(struct dp_vs_service *svc, @@ -52,6 +51,8 @@ int dp_vs_unbind_scheduler(struct dp_vs_service *svc); int dp_vs_gcd_weight(struct dp_vs_service *svc); +struct list_head * dp_vs_sched_first_dest(const struct dp_vs_service *svc); + void dp_vs_scheduler_put(struct dp_vs_scheduler *scheduler); int register_dp_vs_scheduler(struct dp_vs_scheduler *scheduler); diff --git a/include/mbuf.h b/include/mbuf.h index aac4651a4..a08cf1bdc 100644 --- a/include/mbuf.h +++ b/include/mbuf.h @@ -39,6 +39,30 @@ s != NULL; \ s = n, n = s ? s->next : NULL) +#define MBUF_USERDATA(m, type, field) \ + (*((type *)(mbuf_userdata((m), (field))))) + +#define MBUF_USERDATA_CONST(m, type, field) \ + (*((type *)(mbuf_userdata_const((m), (field))))) + +typedef union { + void *hdr; + struct { + uint64_t l2_len:RTE_MBUF_L2_LEN_BITS; /* L2 Header Length */ + uint64_t l3_len:RTE_MBUF_L3_LEN_BITS; /* L3 Header Length */ + uint64_t l4_len:RTE_MBUF_L4_LEN_BITS; /* L4 Header Length */ + uint64_t outer_l2_len:RTE_MBUF_OUTL2_LEN_BITS; /* Outer L2 Header Length */ + uint64_t outer_l3_len:RTE_MBUF_OUTL3_LEN_BITS; /* Outer L3 Header Length */ + }; +} mbuf_userdata_field_proto_t; + +typedef void * mbuf_userdata_field_route_t; + +typedef enum { + MBUF_FIELD_PROTO = 0, + MBUF_FIELD_ROUTE, +} mbuf_usedata_field_t; + /** * mbuf_copy_bits - copy bits from mbuf to buffer. * see skb_copy_bits(). 
@@ -123,4 +147,14 @@ void mbuf_copy_metadata(struct rte_mbuf *mi, struct rte_mbuf *m); inline void dp_vs_mbuf_dump(const char *msg, int af, const struct rte_mbuf *mbuf); #endif +void *mbuf_userdata(struct rte_mbuf *, mbuf_usedata_field_t); +void *mbuf_userdata_const(const struct rte_mbuf *, mbuf_usedata_field_t); + +static inline void mbuf_userdata_reset(struct rte_mbuf *m) +{ + memset((void *)m->dynfield1, 0, sizeof(m->dynfield1)); +} + +int mbuf_init(void); + #endif /* __DP_VS_MBUF_H__ */ diff --git a/include/neigh.h b/include/neigh.h index 3590fc641..f29f6f30d 100644 --- a/include/neigh.h +++ b/include/neigh.h @@ -54,7 +54,7 @@ struct neighbour_entry { int af; struct list_head neigh_list; union inet_addr ip_addr; - struct ether_addr eth_addr; + struct rte_ether_addr eth_addr; struct netif_port *port; struct dpvs_timer timer; struct list_head queue_list; @@ -89,7 +89,7 @@ struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, void neigh_send_mbuf_cach(struct neighbour_entry *neighbour); int neigh_edit(struct neighbour_entry *neighbour, - struct ether_addr *eth_addr); + struct rte_ether_addr *eth_addr); int neigh_init(void); @@ -105,7 +105,7 @@ int neigh_output(int af, struct netif_port *port); struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, - const struct ether_addr *eth_addr, + const struct rte_ether_addr *eth_addr, struct netif_port *port, unsigned int hashkey, int flag); @@ -118,7 +118,7 @@ void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port); int neigh_sync_core(const void *param, bool add_del, enum param_kind kind); static inline void ipv6_mac_mult(const struct in6_addr *mult_target, - struct ether_addr *mult_eth) + struct rte_ether_addr *mult_eth) { uint8_t *w = (uint8_t *)mult_eth; w[0] = 0x33; diff --git a/include/netif.h b/include/netif.h index 0226457d8..c80a1746c 100644 --- a/include/netif.h +++ b/include/netif.h @@ -18,6 +18,7 @@ #ifndef __DPVS_NETIF_H__ #define __DPVS_NETIF_H__ #include +#include #include "list.h" #include "dpdk.h" #include "inetaddr.h" @@ -166,12 +167,12 @@ typedef enum { } port_type_t; struct netif_kni { - char name[IFNAMSIZ]; - struct rte_kni *kni; - struct ether_addr addr; - struct dpvs_timer kni_rtnl_timer; - int kni_rtnl_fd; - struct rte_ring *rx_ring; + char name[IFNAMSIZ]; + struct rte_kni *kni; + struct rte_ether_addr addr; + struct dpvs_timer kni_rtnl_timer; + int kni_rtnl_fd; + struct rte_ring *rx_ring; } __rte_cache_aligned; union netif_bond { @@ -192,10 +193,8 @@ struct netif_ops { int (*op_open)(struct netif_port *dev); int (*op_stop)(struct netif_port *dev); int (*op_xmit)(struct rte_mbuf *m, struct netif_port *dev); + int (*op_update_addr)(struct netif_port *dev); int (*op_set_mc_list)(struct netif_port *dev); - int (*op_filter_supported)(struct netif_port *dev, enum rte_filter_type fltype); - int (*op_set_fdir_filt)(struct netif_port *dev, enum rte_filter_op op, - const struct rte_eth_fdir_filter *filt); int (*op_get_queue)(struct netif_port *dev, lcoreid_t cid, queueid_t *qid); int (*op_get_link)(struct netif_port *dev, struct rte_eth_link *link); int (*op_get_promisc)(struct netif_port *dev, bool *promisc); @@ -204,7 +203,7 @@ struct netif_ops { struct netif_hw_addr { struct list_head list; - struct ether_addr addr; + struct rte_ether_addr addr; rte_atomic32_t refcnt; /* * - sync only once! 
@@ -236,7 +235,7 @@ struct netif_port { int ntxq; /* tx queue numbe */ uint16_t rxq_desc_nb; /* rx queue descriptor number */ uint16_t txq_desc_nb; /* tx queue descriptor number */ - struct ether_addr addr; /* MAC address */ + struct rte_ether_addr addr; /* MAC address */ struct netif_hw_addr_list mc; /* HW multicast list */ int socket; /* socket id */ int hw_header_len; /* HW header length */ @@ -279,10 +278,6 @@ int netif_register_pkt(struct pkt_type *pt); int netif_unregister_pkt(struct pkt_type *pt); /**************************** port API ******************************/ -int netif_fdir_filter_set(struct netif_port *port, enum rte_filter_op opcode, - const struct rte_eth_fdir_filter *fdir_flt); -void netif_mask_fdir_filter(int af, const struct netif_port *port, - struct rte_eth_fdir_filter *filt); struct netif_port* netif_port_get(portid_t id); /* port_conf can be NULL for default port configure */ int netif_print_port_conf(const struct rte_eth_conf *port_conf, char *buf, int *len); @@ -339,6 +334,4 @@ static inline uint16_t dpvs_rte_eth_dev_count(void) #endif } -extern bool dp_vs_fdir_filter_enable; - #endif /* __DPVS_NETIF_H__ */ diff --git a/include/netif_addr.h b/include/netif_addr.h index 929395ec3..1a6b97d71 100644 --- a/include/netif_addr.h +++ b/include/netif_addr.h @@ -25,16 +25,16 @@ #define __DPVS_NETIF_ADDR_H__ #include "netif.h" -int __netif_mc_add(struct netif_port *dev, const struct ether_addr *addr); -int __netif_mc_del(struct netif_port *dev, const struct ether_addr *addr); -int netif_mc_add(struct netif_port *dev, const struct ether_addr *addr); -int netif_mc_del(struct netif_port *dev, const struct ether_addr *addr); +int __netif_mc_add(struct netif_port *dev, const struct rte_ether_addr *addr); +int __netif_mc_del(struct netif_port *dev, const struct rte_ether_addr *addr); +int netif_mc_add(struct netif_port *dev, const struct rte_ether_addr *addr); +int netif_mc_del(struct netif_port *dev, const struct rte_ether_addr *addr); void netif_mc_flush(struct netif_port *dev); void netif_mc_init(struct netif_port *dev); int __netif_mc_dump(struct netif_port *dev, - struct ether_addr *addrs, size_t *naddr); + struct rte_ether_addr *addrs, size_t *naddr); int netif_mc_dump(struct netif_port *dev, - struct ether_addr *addrs, size_t *naddr); + struct rte_ether_addr *addrs, size_t *naddr); int __netif_mc_print(struct netif_port *dev, char *buf, int *len, int *pnaddr); int netif_mc_print(struct netif_port *dev, @@ -50,8 +50,8 @@ int netif_mc_sync_multiple(struct netif_port *to, struct netif_port *from); int __netif_mc_unsync_multiple(struct netif_port *to, struct netif_port *from); int netif_mc_unsync_multiple(struct netif_port *to, struct netif_port *from); -static inline int eth_addr_equal(const struct ether_addr *addr1, - const struct ether_addr *addr2) +static inline int eth_addr_equal(const struct rte_ether_addr *addr1, + const struct rte_ether_addr *addr2) { const uint16_t *a = (const uint16_t *)addr1; const uint16_t *b = (const uint16_t *)addr2; @@ -59,7 +59,7 @@ static inline int eth_addr_equal(const struct ether_addr *addr1, return ((a[0]^b[0]) | (a[1]^b[1]) | (a[2]^b[2])) == 0; } -static inline char *eth_addr_dump(const struct ether_addr *ea, +static inline char *eth_addr_dump(const struct rte_ether_addr *ea, char *buf, size_t size) { snprintf(buf, size, "%02x:%02x:%02x:%02x:%02x:%02x", diff --git a/include/netif_flow.h b/include/netif_flow.h new file mode 100644 index 000000000..372a40899 --- /dev/null +++ b/include/netif_flow.h @@ -0,0 +1,96 @@ +/* + * DPVS is a 
software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2020 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __NETIF_FLOW_H__ +#define __NETIF_FLOW_H__ + +#include "netif.h" + +struct netif_flow_handler { + portid_t pid; + void *handler; +}; + +typedef struct netif_flow_handler_param { + int size; + int flow_num; + struct netif_flow_handler *handlers; // pointing to an netif_flow_handler array from outside +} netif_flow_handler_param_t; + +/* + * Add sapool flow rules (for fullnat and snat). + * + * @param dev [in] + * Target device for the flow rules, supporting bonding/physical ports. + * @param cid [in] + * Lcore id to which to route the target flow. + * @param af [in] + * IP address family. + * @param addr [in] + * IP address of the sapool. + * @param port_base [in] + * TCP/UDP base port of the sapool. + * @param port_mask [in] + * TCP/UDP mask mask of the sapool. + * @param flows [out] + * Containing netif flow handlers if success, undefined otherwise. + * + * @return + * DPVS error code. + */ +int netif_sapool_flow_add(struct netif_port *dev, lcoreid_t cid, + int af, const union inet_addr *addr, + __be16 port_base, __be16 port_mask, + netif_flow_handler_param_t *flows); + +/* + * Delete saflow rules (for fullnat and snat). + * @param dev [in] + * Target device for the flow rules, supporting bonding/physical ports. + * @param cid [in] + * Lcore id to which to route the target flow. + * @param af [in] + * IP address family. + * @param addr [in] + * IP address of the sapool. + * @param port_base [in] + * TCP/UDP base port of the sapool. + * @param port_mask [in] + * TCP/UDP mask mask of the sapool. + * @param flows [in] + * Containing netif flow handlers to delete. + * + * @return + * DPVS error code. + */ +int netif_sapool_flow_del(struct netif_port *dev, lcoreid_t cid, + int af, const union inet_addr *addr, + __be16 port_base, __be16 port_mask, + netif_flow_handler_param_t *flows); + +/* + * Flush all flow rules on a port. * + * @param dev + * Target device, supporting bonding/physical ports. + * + * @return + * DPVS error code. + */ +int netif_flow_flush(struct netif_port *dev); + +#endif diff --git a/include/sa_pool.h b/include/sa_pool.h index dfcdc4ace..958aa0428 100644 --- a/include/sa_pool.h +++ b/include/sa_pool.h @@ -23,10 +23,10 @@ * ways to achieve the goal. one is to calc RSS the same way of * NIC to select the correct CPU for connect. * - * the way we use is based on Flow-Director (fdir), allocate + * the way we use is based on DPDK Generic Flow(rte_flow), allocate * local source (e.g., ) for each CPU core in advance. - * and redirect the back traffic to that CPU by fdir. it does not - * need two many fdir rules, the number of rules can be equal to + * and redirect the back traffic to that CPU by rte_flow. it does not + * need two many flow rules, the number of rules can be equal to * the number of CPU core. 
* * LVS use laddr and try to see if is used when @@ -42,9 +42,10 @@ #ifndef __DPVS_SA_POOL__ #define __DPVS_SA_POOL__ -#define MAX_PORT 65536 +#include "netif_flow.h" -#define MAX_FDIR_PROTO 2 +#define MAX_PORT 65536 +#define MAX_SA_FLOW 4 struct sa_pool_stats { uint32_t used_cnt; @@ -58,8 +59,7 @@ struct sa_pool_stats { * 2. use uint8_t flag * 3. remove sa_entry.addr, and get IP from sa_pool->ifa * 4. to __packed__ sa_entry. - * 5. alloc sa_entries[] for 65536/cpu_num only. - * 6. create sa_entry_pool only if pool_hash hit. + * 5. create sa_entry_pool only if pool_hash hit. * since when dest (like RS) num may small. */ @@ -87,21 +87,21 @@ struct sa_entry_pool { /* no lock needed because inet_ifaddr.sa_pool * is per-lcore. */ struct sa_pool { - struct inet_ifaddr *ifa; /* back-pointer */ + struct inet_ifaddr *ifa; /* back-pointer */ - uint16_t low; /* min port */ - uint16_t high; /* max port */ - rte_atomic32_t refcnt; + uint16_t low; /* min port */ + uint16_t high; /* max port */ + rte_atomic32_t refcnt; /* hashed pools by dest's . if no dest provided, * just use first pool. it's not need create/destroy pool * for each dest, that'll be too complicated. */ - struct sa_entry_pool *pool_hash; - uint8_t pool_hash_sz; - uint32_t flags; /* SA_POOL_F_XXX */ + struct sa_entry_pool *pool_hash; + uint8_t pool_hash_sz; + uint32_t flags; /* SA_POOL_F_XXX */ - /* fdir filter ID */ - uint32_t filter_id[MAX_FDIR_PROTO]; + int flow_num; + struct netif_flow_handler flows[MAX_SA_FLOW]; }; int sa_pool_init(void); diff --git a/patch/dpdk-stable-17.11.2/0001-kni-use-netlink-event-for-multicast-driver-part.patch b/patch/dpdk-stable-17.11.2/0001-kni-use-netlink-event-for-multicast-driver-part.patch deleted file mode 100644 index 05b10eba2..000000000 --- a/patch/dpdk-stable-17.11.2/0001-kni-use-netlink-event-for-multicast-driver-part.patch +++ /dev/null @@ -1,108 +0,0 @@ -From 52f4389c80b4b41386c53daf16d860305252f325 Mon Sep 17 00:00:00 2001 -From: Lei Chen -Date: Tue, 23 Jan 2018 12:39:56 +0800 -Subject: [PATCH 1/4] kni: use netlink event for multicast (driver part). - -kni driver send netlink event every time hw-multicast list updated by -kernel, the user kni app should capture the event and update multicast -to kni device. - -original way is using rte_kni_request to pass hw-multicast to user kni -module. that method works but finally memory corruption found, which is -not easy to address. ---- - lib/librte_eal/linuxapp/kni/kni_net.c | 68 +++++++++++++++++++++++++++++++++++ - 1 file changed, 68 insertions(+) - -diff --git a/lib/librte_eal/linuxapp/kni/kni_net.c b/lib/librte_eal/linuxapp/kni/kni_net.c -index db9f489..fab94d1 100644 ---- a/lib/librte_eal/linuxapp/kni/kni_net.c -+++ b/lib/librte_eal/linuxapp/kni/kni_net.c -@@ -35,6 +35,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -579,9 +581,75 @@ - return 0; - } - -+static size_t -+kni_nlmsg_size(void) -+{ -+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) -+ + nla_total_size(4) /* IFA_ADDRESS */ -+ + nla_total_size(4) /* IFA_LOCAL */ -+ + nla_total_size(4) /* IFA_BROADCAST */ -+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ -+ + nla_total_size(4) /* IFA_FLAGS */ -+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ -+} -+ - static void - kni_net_set_rx_mode(struct net_device *dev) - { -+ /* -+ * send event to notify user (DPDK KNI app) that multicast list changed, -+ * so that it can monitor multicast join/leave and set HW mc-addrs to -+ * kni dev accordinglly. 
-+ * -+ * this event is just an notification, we do not save any mc-addr here -+ * (so attribute space for us). user kni app should get maddrs after -+ * receive this notification. -+ * -+ * I was expecting kernel send some rtnl event for multicast join/leave, -+ * but it doesn't. By checking the call-chain of SIOCADDMULTI (ip maddr, -+ * manages only hardware multicast) and IP_ADD_MEMBERSHIP (ip_mc_join_group, -+ * used to for IPv4 multicast), no rtnl event sent. -+ * -+ * so as workaround, modify kni driver here to send RTM_NEWADDR. -+ * it may not suitalbe to use this event for mcast, but that should works. -+ * hope that won't affect other listener to this event. -+ * -+ * previous solution was using rte_kni_request to pass hw-maddr list to user. -+ * it "works" for times but finally memory corruption found, which is -+ * not easy to address (lock was added and reviewed). That's why we use -+ * netlink event instead. -+ */ -+ struct sk_buff *skb; -+ struct net *net = dev_net(dev); -+ struct nlmsghdr *nlh; -+ struct ifaddrmsg *ifm; -+ -+ skb = nlmsg_new(kni_nlmsg_size(), GFP_ATOMIC); -+ if (!skb) -+ return; -+ -+ /* no other event for us ? */ -+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWADDR, sizeof(*ifm), 0); -+ if (!nlh) { -+ kfree_skb(skb); -+ return; -+ } -+ -+ /* just send an notification so no other info */ -+ ifm = nlmsg_data(nlh); -+ memset(ifm, 0, sizeof(*ifm)); -+ ifm->ifa_family = AF_UNSPEC; -+ ifm->ifa_prefixlen = 0; -+ ifm->ifa_flags = 0; -+ ifm->ifa_scope = RT_SCOPE_NOWHERE; -+ ifm->ifa_index = 0; -+ -+ nlmsg_end(skb, nlh); -+ -+ /* other group ? */ -+ pr_debug("%s: rx-mode/multicast-list changed\n", __func__); -+ rtnl_notify(skb, net, 0, RTNLGRP_NOTIFY, NULL, GFP_ATOMIC); -+ return; - } - - static int --- -1.8.3.1 - diff --git a/patch/dpdk-stable-17.11.2/0002-net-support-variable-IP-header-len-for-checksum-API.patch b/patch/dpdk-stable-17.11.2/0002-net-support-variable-IP-header-len-for-checksum-API.patch deleted file mode 100644 index 96dd76ea8..000000000 --- a/patch/dpdk-stable-17.11.2/0002-net-support-variable-IP-header-len-for-checksum-API.patch +++ /dev/null @@ -1,48 +0,0 @@ -From a949f95267849630a750f1e72ee468d58b806589 Mon Sep 17 00:00:00 2001 -From: Lei Chen -Date: Tue, 6 Mar 2018 16:04:36 +0800 -Subject: [PATCH 2/4] net: support variable IP header len for checksum API. - -IPv4 checksum APIs use fixe IP header length, it will failed if there is -any IP option. Now calculating header length by "ihl" field, so that we -can support options. - -Signed-off-by: Lei Chen ---- - lib/librte_net/rte_ip.h | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/lib/librte_net/rte_ip.h b/lib/librte_net/rte_ip.h -index 73ec398..e03f707 100644 ---- a/lib/librte_net/rte_ip.h -+++ b/lib/librte_net/rte_ip.h -@@ -314,7 +314,7 @@ struct ipv4_hdr { - rte_ipv4_cksum(const struct ipv4_hdr *ipv4_hdr) - { - uint16_t cksum; -- cksum = rte_raw_cksum(ipv4_hdr, sizeof(struct ipv4_hdr)); -+ cksum = rte_raw_cksum(ipv4_hdr, (ipv4_hdr->version_ihl & 0xf) * 4); - return (cksum == 0xffff) ? 
cksum : ~cksum; - } - -@@ -356,7 +356,7 @@ struct ipv4_hdr { - } else { - psd_hdr.len = rte_cpu_to_be_16( - (uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length) -- - sizeof(struct ipv4_hdr))); -+ - (ipv4_hdr->version_ihl & 0xf) * 4)); - } - return rte_raw_cksum(&psd_hdr, sizeof(psd_hdr)); - } -@@ -381,7 +381,7 @@ struct ipv4_hdr { - uint32_t l4_len; - - l4_len = rte_be_to_cpu_16(ipv4_hdr->total_length) - -- sizeof(struct ipv4_hdr); -+ (ipv4_hdr->version_ihl & 0xf) * 4; - - cksum = rte_raw_cksum(l4_hdr, l4_len); - cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0); --- -1.8.3.1 - diff --git a/patch/dpdk-stable-17.11.2/0003-pdump-support-filter.patch b/patch/dpdk-stable-17.11.2/0003-pdump-support-filter.patch deleted file mode 100644 index 0a4ed7e80..000000000 --- a/patch/dpdk-stable-17.11.2/0003-pdump-support-filter.patch +++ /dev/null @@ -1,926 +0,0 @@ -diff -uparN /home/weiyanhua/dpvs/dpdk-stable-17.11.2/app/pdump/main.c dpdk-stable-17.11.2/app/pdump/main.c ---- /home/weiyanhua/dpvs/dpdk-stable-17.11.2/app/pdump/main.c 2018-04-19 22:09:22.000000000 +0800 -+++ dpdk-stable-17.11.2/app/pdump/main.c 2019-10-17 18:57:38.979759023 +0800 -@@ -54,6 +54,7 @@ - #include - #include - #include -+#include - - #define CMD_LINE_OPT_PDUMP "pdump" - #define PDUMP_PORT_ARG "port" -@@ -65,6 +66,13 @@ - #define PDUMP_RING_SIZE_ARG "ring-size" - #define PDUMP_MSIZE_ARG "mbuf-size" - #define PDUMP_NUM_MBUFS_ARG "total-num-mbufs" -+#define PDUMP_HOST_ARG "host" -+#define PDUMP_SRC_ARG "src-host" -+#define PDUMP_DST_ARG "dst-host" -+#define PDUMP_PROTO_PORT_AGE "proto-port" -+#define PDUMP_SPORT_ARG "src-port" -+#define PDUMP_DPORT_ARG "dst-port" -+#define PDUMP_PROTO_ARG "proto" - #define CMD_LINE_OPT_SER_SOCK_PATH "server-socket-path" - #define CMD_LINE_OPT_CLI_SOCK_PATH "client-socket-path" - -@@ -120,6 +128,13 @@ const char *valid_pdump_arguments[] = { - PDUMP_RING_SIZE_ARG, - PDUMP_MSIZE_ARG, - PDUMP_NUM_MBUFS_ARG, -+ PDUMP_HOST_ARG, -+ PDUMP_SRC_ARG, -+ PDUMP_DST_ARG, -+ PDUMP_PROTO_PORT_AGE, -+ PDUMP_SPORT_ARG, -+ PDUMP_DPORT_ARG, -+ PDUMP_PROTO_ARG, - NULL - }; - -@@ -153,6 +168,7 @@ struct pdump_tuples { - enum pcap_stream rx_vdev_stream_type; - enum pcap_stream tx_vdev_stream_type; - bool single_pdump_dev; -+ struct pdump_filter *filter; - - /* stats */ - struct pdump_stats stats; -@@ -180,6 +196,11 @@ pdump_usage(const char *prgname) - "(queue=)," - "(rx-dev= |" - " tx-dev=," -+ "[host= | src-host= |" -+ "dst-host=]," -+ "[proto=support:tcp/udp/icmp]," -+ "[proto-port= |src-port= |" -+ "dst-port=]," - "[ring-size=default:16384]," - "[mbuf-size=default:2176]," - "[total-num-mbufs=default:65535]'\n" -@@ -270,6 +291,64 @@ parse_uint_value(const char *key, const - } - - static int -+parse_host(const char *key __rte_unused, const char *value, void *extra_args) -+{ -+ struct pdump_tuples *pt = extra_args; -+ struct in_addr inaddr; -+ struct in6_addr inaddr6; -+ union addr addr; -+ int af = 0; -+ -+ if (inet_pton(AF_INET6, value, &inaddr6) > 0) { -+ af = AF_INET6; -+ addr.in6 = inaddr6; -+ } else if (inet_pton(AF_INET, value, &inaddr) > 0){ -+ af = AF_INET; -+ addr.in = inaddr; -+ } else { -+ printf("IP address invaled\n"); -+ return -EINVAL; -+ } -+ -+ if (pt->filter && pt->filter->af != 0 && af != pt->filter->af) { -+ printf("IPv4 and IPv6 conflict\n"); -+ return -EINVAL; -+ } else { -+ pt->filter->af = af; -+ } -+ -+ if (!strcmp(key, PDUMP_HOST_ARG)) { -+ rte_memcpy(&pt->filter->host_addr, &addr, sizeof(addr)); -+ } else if (!strcmp(key, PDUMP_SRC_ARG)) { -+ rte_memcpy(&pt->filter->s_addr, &addr, sizeof(addr)); -+ 
} else if (!strcmp(key, PDUMP_DST_ARG)) { -+ rte_memcpy(&pt->filter->d_addr, &addr, sizeof(addr)); -+ } -+ -+ return 0; -+} -+ -+static int -+parse_proto(const char *key __rte_unused, const char *value, void *extra_args) -+{ -+ struct pdump_tuples *pt = extra_args; -+ -+ if (!strcmp(value, "tcp")) { -+ pt->filter->proto = IPPROTO_TCP; -+ } else if (!strcmp(value, "udp")) { -+ pt->filter->proto = IPPROTO_UDP; -+ } else if (!strcmp(value, "icmp")) { -+ pt->filter->proto = IPPROTO_ICMP; -+ } else { -+ printf("invalid value:\"%s\" for key:\"%s\", " -+ "value must be tcp/udp/icmp\n", value, key); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int - parse_pdump(const char *optarg) - { - struct rte_kvargs *kvlist; -@@ -396,6 +475,75 @@ parse_pdump(const char *optarg) - } else - pt->total_num_mbufs = MBUFS_PER_POOL; - -+ /* filter parsing and validation */ -+ pt->filter = rte_zmalloc("pdump_filter", -+ sizeof(struct pdump_filter), 0); -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_HOST_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_HOST_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_SRC_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_SRC_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_DST_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_DST_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_PORT_AGE); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_PORT_AGE, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->proto_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_SPORT_ARG); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_SPORT_ARG, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->s_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_DPORT_ARG); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_DPORT_ARG, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->d_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_ARG, -+ &parse_proto, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ - num_tuples++; - - free_kvlist: -@@ -540,6 +688,8 @@ cleanup_rings(void) - rte_ring_free(pt->rx_ring); - if (pt->tx_ring) - rte_ring_free(pt->tx_ring); -+ if (pt->filter) -+ rte_free(pt->filter); - } - } - -@@ -583,11 +733,10 @@ configure_vdev(uint16_t port_id) - { - struct ether_addr addr; - const uint16_t rxRings = 0, txRings = 1; -- const uint8_t nb_ports = rte_eth_dev_count(); - int ret; - uint16_t q; - -- if (port_id > nb_ports) -+ if (!rte_eth_dev_is_valid_port(port_id)) - return -1; - - ret = rte_eth_dev_configure(port_id, rxRings, txRings, -@@ -799,20 +948,20 @@ enable_pdump(void) - pt->queue, - RTE_PDUMP_FLAG_RX, - pt->rx_ring, -- pt->mp, NULL); -+ pt->mp, pt->filter); - ret1 = rte_pdump_enable_by_deviceid( - pt->device_id, - pt->queue, - RTE_PDUMP_FLAG_TX, - pt->tx_ring, -- pt->mp, NULL); -+ pt->mp, pt->filter); - } else if (pt->dump_by_type == PORT_ID) { - ret = rte_pdump_enable(pt->port, pt->queue, - RTE_PDUMP_FLAG_RX, -- pt->rx_ring, pt->mp, NULL); -+ pt->rx_ring, 
pt->mp, pt->filter); - ret1 = rte_pdump_enable(pt->port, pt->queue, - RTE_PDUMP_FLAG_TX, -- pt->tx_ring, pt->mp, NULL); -+ pt->tx_ring, pt->mp, pt->filter); - } - } else if (pt->dir == RTE_PDUMP_FLAG_RX) { - if (pt->dump_by_type == DEVICE_ID) -@@ -820,22 +969,22 @@ enable_pdump(void) - pt->device_id, - pt->queue, - pt->dir, pt->rx_ring, -- pt->mp, NULL); -+ pt->mp, pt->filter); - else if (pt->dump_by_type == PORT_ID) - ret = rte_pdump_enable(pt->port, pt->queue, - pt->dir, -- pt->rx_ring, pt->mp, NULL); -+ pt->rx_ring, pt->mp, pt->filter); - } else if (pt->dir == RTE_PDUMP_FLAG_TX) { - if (pt->dump_by_type == DEVICE_ID) - ret = rte_pdump_enable_by_deviceid( - pt->device_id, - pt->queue, - pt->dir, -- pt->tx_ring, pt->mp, NULL); -+ pt->tx_ring, pt->mp, pt->filter); - else if (pt->dump_by_type == PORT_ID) - ret = rte_pdump_enable(pt->port, pt->queue, - pt->dir, -- pt->tx_ring, pt->mp, NULL); -+ pt->tx_ring, pt->mp, pt->filter); - } - if (ret < 0 || ret1 < 0) { - cleanup_pdump_resources(); -diff -uparN /home/weiyanhua/dpvs/dpdk-stable-17.11.2/app/pdump/Makefile dpdk-stable-17.11.2/app/pdump/Makefile ---- /home/weiyanhua/dpvs/dpdk-stable-17.11.2/app/pdump/Makefile 2018-04-19 22:09:22.000000000 +0800 -+++ dpdk-stable-17.11.2/app/pdump/Makefile 2019-10-16 20:21:23.939178027 +0800 -@@ -41,6 +41,6 @@ CFLAGS += $(WERROR_FLAGS) - - SRCS-y := main.c - --include $(RTE_SDK)/mk/rte.app.mk -+include $(RTE_SDK)/mk/rte.pdump.mk - - endif -diff -uparN /home/weiyanhua/dpvs/dpdk-stable-17.11.2/config/common_base dpdk-stable-17.11.2/config/common_base ---- /home/weiyanhua/dpvs/dpdk-stable-17.11.2/config/common_base 2018-04-19 22:09:22.000000000 +0800 -+++ dpdk-stable-17.11.2/config/common_base 2019-10-16 20:21:23.940178039 +0800 -@@ -397,7 +397,7 @@ CONFIG_RTE_PMD_RING_MAX_TX_RINGS=16 - # - # Compile software PMD backed by PCAP files - # --CONFIG_RTE_LIBRTE_PMD_PCAP=n -+CONFIG_RTE_LIBRTE_PMD_PCAP=y - - # - # Compile link bonding PMD library -diff -uparN /home/weiyanhua/dpvs/dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.c dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.c ---- /home/weiyanhua/dpvs/dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.c 2018-04-19 22:09:22.000000000 +0800 -+++ dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.c 2019-10-17 18:07:28.821435563 +0800 -@@ -46,6 +46,10 @@ - #include - #include - #include -+#include -+#include -+#include -+#include - - #include "rte_pdump.h" - -@@ -177,6 +181,132 @@ pdump_pktmbuf_copy(struct rte_mbuf *m, s - return m_dup; - } - -+static bool -+inet_addr_equal(int af, const union addr *a1, -+ const union addr *a2) -+{ -+ switch (af) { -+ case AF_INET: -+ return a1->in.s_addr == a2->in.s_addr; -+ case AF_INET6: -+ return memcmp(a1->in6.s6_addr, a2->in6.s6_addr, 16) == 0; -+ default: -+ return memcmp(a1, a2, sizeof(union addr)) == 0; -+ } -+} -+ -+static bool -+inet_is_addr_any(int af, const union addr *addr) -+{ -+ switch (af) { -+ case AF_INET: -+ return addr->in.s_addr == htonl(INADDR_ANY); -+ case AF_INET6: -+ return IN6_ARE_ADDR_EQUAL(&addr->in6, &in6addr_any); -+ default: -+ return false; -+ } -+} -+ -+/* support vlan/arp/ipv4/ipv6 */ -+static int -+pdump_filter(struct rte_mbuf *m, struct pdump_filter *filter) -+{ -+ struct ether_hdr *eth_hdr; -+ struct vlan_eth_hdr *vlan_eth_hdr; -+ union addr s_addr, d_addr; -+ int prepend = 0; -+ uint16_t type = 0; -+ uint16_t iph_len = 0; -+ uint8_t proto = 0; -+ -+ int af; -+ -+ if (filter->af == 0 && filter->s_port == 0 && -+ filter->d_port == 0 && filter->proto == 0) -+ return 0; -+ -+ eth_hdr = rte_pktmbuf_mtod(m, 
struct ether_hdr *); -+ -+ if (eth_hdr->ether_type == htons(ETH_P_8021Q)) { -+ prepend += sizeof(struct vlan_eth_hdr); -+ vlan_eth_hdr = rte_pktmbuf_mtod(m, struct vlan_eth_hdr *); -+ type = vlan_eth_hdr->h_vlan_encapsulated_proto; -+ } else { -+ prepend += sizeof(struct ether_hdr); -+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); -+ type = eth_hdr->ether_type; -+ } -+ -+ if (rte_pktmbuf_adj(m, prepend) == NULL) -+ goto prepend; -+ -+ if (type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) { -+ struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *); -+ af = AF_INET; -+ s_addr.in.s_addr = arp->arp_data.arp_sip; -+ d_addr.in.s_addr = arp->arp_data.arp_tip; -+ } else if (type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) { -+ struct ipv4_hdr *ip4 = rte_pktmbuf_mtod(m, struct ipv4_hdr *); -+ af = AF_INET; -+ s_addr.in.s_addr = ip4->src_addr; -+ d_addr.in.s_addr = ip4->dst_addr; -+ proto = ip4->next_proto_id; -+ iph_len = (ip4->version_ihl & 0xf) << 2; -+ } else if (type == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) { -+ struct ipv6_hdr *ip6 = rte_pktmbuf_mtod(m, struct ipv6_hdr *); -+ af = AF_INET6; -+ rte_memcpy(&s_addr.in6, &ip6->src_addr, 16); -+ rte_memcpy(&d_addr.in6, &ip6->dst_addr, 16); -+ proto = ip6->proto; -+ iph_len = sizeof(struct ipv6_hdr); -+ } else { -+ goto prepend; -+ } -+ -+ /*filter*/ -+ if (!inet_is_addr_any(af, &filter->s_addr) && -+ !inet_addr_equal(af, &filter->s_addr, &s_addr)) -+ goto prepend; -+ if (!inet_is_addr_any(af, &filter->d_addr) && -+ !inet_addr_equal(af, &filter->d_addr, &d_addr)) -+ goto prepend; -+ if (!inet_is_addr_any(af, &filter->host_addr) && -+ !inet_addr_equal(af, &filter->host_addr, &s_addr) && -+ !inet_addr_equal(af, &filter->host_addr, &d_addr)) -+ goto prepend; -+ -+ if (filter->proto && filter->proto != proto) -+ goto prepend; -+ -+ if (filter->s_port || filter->d_port) { -+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) -+ goto prepend; -+ struct udp_hdr _uh; -+ const struct udp_hdr *uh; -+ uh = rte_pktmbuf_read(m, iph_len, sizeof(_uh), &_uh); -+ if (uh == NULL) -+ goto prepend; -+ if (filter->s_port && filter->s_port != rte_cpu_to_be_16(uh->src_port)) -+ goto prepend; -+ -+ if (filter->d_port && filter->d_port != rte_cpu_to_be_16(uh->dst_port)) -+ goto prepend; -+ -+ if (filter->proto_port && -+ filter->proto_port != rte_cpu_to_be_16(uh->src_port) && -+ filter->proto_port != rte_cpu_to_be_16(uh->dst_port)) -+ goto prepend; -+ } -+ -+ rte_pktmbuf_prepend(m, prepend); -+ return 0; -+ -+prepend: -+ rte_pktmbuf_prepend(m, prepend); -+ return -1; -+} -+ - static inline void - pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params) - { -@@ -193,6 +323,8 @@ pdump_copy(struct rte_mbuf **pkts, uint1 - ring = cbs->ring; - mp = cbs->mp; - for (i = 0; i < nb_pkts; i++) { -+ if (pdump_filter(pkts[i], cbs->filter) != 0) -+ continue; - p = pdump_pktmbuf_copy(pkts[i], mp); - if (p) - dup_bufs[d_pkts++] = p; -@@ -229,7 +361,7 @@ pdump_tx(uint16_t port __rte_unused, uin - static int - pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, - struct rte_ring *ring, struct rte_mempool *mp, -- uint16_t operation) -+ struct pdump_filter *filter, uint16_t operation) - { - uint16_t qid; - struct pdump_rxtx_cbs *cbs = NULL; -@@ -247,6 +379,7 @@ pdump_register_rx_callbacks(uint16_t end - } - cbs->ring = ring; - cbs->mp = mp; -+ cbs->filter = filter; - cbs->cb = rte_eth_add_first_rx_callback(port, qid, - pdump_rx, cbs); - if (cbs->cb == NULL) { -@@ -283,7 +416,7 @@ pdump_register_rx_callbacks(uint16_t end - static int - 
pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, - struct rte_ring *ring, struct rte_mempool *mp, -- uint16_t operation) -+ struct pdump_filter *filter, uint16_t operation) - { - - uint16_t qid; -@@ -302,6 +435,7 @@ pdump_register_tx_callbacks(uint16_t end - } - cbs->ring = ring; - cbs->mp = mp; -+ cbs->filter = filter; - cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx, - cbs); - if (cbs->cb == NULL) { -@@ -345,6 +479,7 @@ set_pdump_rxtx_cbs(struct pdump_request - uint16_t operation; - struct rte_ring *ring; - struct rte_mempool *mp; -+ struct pdump_filter *filter; - - flags = p->flags; - operation = p->op; -@@ -360,6 +495,7 @@ set_pdump_rxtx_cbs(struct pdump_request - queue = p->data.en_v1.queue; - ring = p->data.en_v1.ring; - mp = p->data.en_v1.mp; -+ filter = p->data.en_v1.filter; - } else { - ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device, - &port); -@@ -372,6 +508,7 @@ set_pdump_rxtx_cbs(struct pdump_request - queue = p->data.dis_v1.queue; - ring = p->data.dis_v1.ring; - mp = p->data.dis_v1.mp; -+ filter = p->data.dis_v1.filter; - } - - /* validation if packet capture is for all queues */ -@@ -403,7 +540,7 @@ set_pdump_rxtx_cbs(struct pdump_request - if (flags & RTE_PDUMP_FLAG_RX) { - end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_rx_q : queue + 1; - ret = pdump_register_rx_callbacks(end_q, port, queue, ring, mp, -- operation); -+ filter, operation); - if (ret < 0) - return ret; - } -@@ -412,7 +549,7 @@ set_pdump_rxtx_cbs(struct pdump_request - if (flags & RTE_PDUMP_FLAG_TX) { - end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_tx_q : queue + 1; - ret = pdump_register_tx_callbacks(end_q, port, queue, ring, mp, -- operation); -+ filter, operation); - if (ret < 0) - return ret; - } -diff -uparN /home/weiyanhua/dpvs/dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.h dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.h ---- /home/weiyanhua/dpvs/dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.h 2018-04-19 22:09:22.000000000 +0800 -+++ dpdk-stable-17.11.2/lib/librte_pdump/rte_pdump.h 2019-10-17 17:54:59.401175031 +0800 -@@ -44,6 +44,8 @@ - #include - #include - #include -+#include -+#include - - #ifdef __cplusplus - extern "C" { -@@ -63,6 +65,31 @@ enum rte_pdump_socktype { - RTE_PDUMP_SOCKET_CLIENT = 2 - }; - -+union addr { -+ struct in_addr in; -+ struct in6_addr in6; -+}; -+ -+struct pdump_filter { -+ int af; -+ union addr s_addr; -+ union addr d_addr; -+ union addr host_addr; //s_addr or d_addr -+ -+ uint8_t proto; -+ uint16_t proto_port; //s_port or d_port -+ uint16_t s_port; -+ uint16_t d_port; -+}; -+ -+struct vlan_eth_hdr { -+ unsigned char h_dest[ETH_ALEN]; -+ unsigned char h_source[ETH_ALEN]; -+ unsigned short h_vlan_proto; -+ unsigned short h_vlan_TCI; -+ unsigned short h_vlan_encapsulated_proto; -+}; -+ - /** - * Initialize packet capturing handling - * -diff -uparN /home/weiyanhua/dpvs/dpdk-stable-17.11.2/mk/rte.pdump.mk dpdk-stable-17.11.2/mk/rte.pdump.mk ---- /home/weiyanhua/dpvs/dpdk-stable-17.11.2/mk/rte.pdump.mk 1970-01-01 08:00:00.000000000 +0800 -+++ dpdk-stable-17.11.2/mk/rte.pdump.mk 2019-10-16 20:21:23.941178051 +0800 -@@ -0,0 +1,349 @@ -+# BSD LICENSE -+# -+# Copyright(c) 2010-2017 Intel Corporation. All rights reserved. -+# Copyright(c) 2014-2015 6WIND S.A. -+# All rights reserved. 
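For reference, the pdump filter added above reduces to a per-field "any or match" test: a zero criterion is unset, an address criterion must equal the packet field, and the host/proto-port criteria match either direction. A minimal standalone sketch of that idea, simplified to IPv4 and using illustrative names (pkt_meta, flt) rather than the DPDK patch types:

/*
 * Minimal sketch of the per-field any-or-match test; IPv4 only,
 * struct names are illustrative, not the DPDK patch types.
 */
#include <stdbool.h>
#include <stdint.h>

struct pkt_meta {
    uint32_t saddr, daddr;  /* IPv4 addresses */
    uint16_t sport, dport;  /* TCP/UDP ports */
    uint8_t  proto;         /* IPPROTO_* value */
};

struct flt {                /* 0 always means "unset, match anything" */
    uint32_t saddr, daddr;
    uint32_t host;          /* matches either source or destination */
    uint16_t sport, dport;
    uint16_t port;          /* matches either port */
    uint8_t  proto;
};

static bool match(const struct pkt_meta *p, const struct flt *f)
{
    if (f->saddr && f->saddr != p->saddr)
        return false;
    if (f->daddr && f->daddr != p->daddr)
        return false;
    if (f->host && f->host != p->saddr && f->host != p->daddr)
        return false;
    if (f->proto && f->proto != p->proto)
        return false;
    if (f->sport && f->sport != p->sport)
        return false;
    if (f->dport && f->dport != p->dport)
        return false;
    if (f->port && f->port != p->sport && f->port != p->dport)
        return false;
    return true;            /* every configured criterion passed */
}

int main(void)
{
    struct pkt_meta pkt = { .saddr = 0x0100007f, .proto = 6, .sport = 80 };
    struct flt f = { .proto = 6, .port = 80 };  /* "tcp, either port 80" */
    return match(&pkt, &f) ? 0 : 1;             /* matches -> exit 0 */
}

A zero value means "any", mirroring the inet_is_addr_any() checks in the patch itself.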
-+# -+# Redistribution and use in source and binary forms, with or without -+# modification, are permitted provided that the following conditions -+# are met: -+# -+# * Redistributions of source code must retain the above copyright -+# notice, this list of conditions and the following disclaimer. -+# * Redistributions in binary form must reproduce the above copyright -+# notice, this list of conditions and the following disclaimer in -+# the documentation and/or other materials provided with the -+# distribution. -+# * Neither the name of Intel Corporation nor the names of its -+# contributors may be used to endorse or promote products derived -+# from this software without specific prior written permission. -+# -+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+include $(RTE_SDK)/mk/internal/rte.compile-pre.mk -+include $(RTE_SDK)/mk/internal/rte.install-pre.mk -+include $(RTE_SDK)/mk/internal/rte.clean-pre.mk -+include $(RTE_SDK)/mk/internal/rte.build-pre.mk -+ -+# VPATH contains at least SRCDIR -+VPATH += $(SRCDIR) -+ -+_BUILD = $(APP) -+_INSTALL = $(INSTALL-FILES-y) $(SYMLINK-FILES-y) -+_INSTALL += $(RTE_OUTPUT)/app/$(APP) $(RTE_OUTPUT)/app/$(APP).map -+POSTINSTALL += target-appinstall -+_CLEAN = doclean -+POSTCLEAN += target-appclean -+ -+ifeq ($(NO_LDSCRIPT),) -+LDSCRIPT = $(RTE_LDSCRIPT) -+endif -+ -+# Link only the libraries used in the application -+LDFLAGS += --as-needed -+ -+# default path for libs -+_LDLIBS-y += -L$(RTE_SDK_BIN)/lib -+ -+# -+# Order is important: from higher level to lower level -+# -+_LDLIBS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += -lrte_flow_classify -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PIPELINE) += -lrte_pipeline -+_LDLIBS-$(CONFIG_RTE_LIBRTE_TABLE) += -lrte_table -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PORT) += -lrte_port -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor -+_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag -+_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro -+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso -+_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter -+_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm -+# librte_acl needs --whole-archive because of weak functions -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ACL) += --whole-archive -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ACL) += --no-whole-archive -+_LDLIBS-$(CONFIG_RTE_LIBRTE_JOBSTATS) += -lrte_jobstats -+_LDLIBS-$(CONFIG_RTE_LIBRTE_METRICS) += -lrte_metrics -+_LDLIBS-$(CONFIG_RTE_LIBRTE_BITRATE) += -lrte_bitratestats -+_LDLIBS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS) += -lrte_latencystats -+_LDLIBS-$(CONFIG_RTE_LIBRTE_POWER) += -lrte_power -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_TIMER) += -lrte_timer -+_LDLIBS-$(CONFIG_RTE_LIBRTE_EFD) += -lrte_efd -+ -+_LDLIBS-y += --whole-archive -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE) += -lrte_cfgfile 
-+_LDLIBS-$(CONFIG_RTE_LIBRTE_HASH) += -lrte_hash -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lrte_vhost -+_LDLIBS-$(CONFIG_RTE_LIBRTE_KVARGS) += -lrte_kvargs -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MBUF) += -lrte_mbuf -+_LDLIBS-$(CONFIG_RTE_LIBRTE_NET) += -lrte_net -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ETHER) += -lrte_ethdev -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += -lrte_cryptodev -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SECURITY) += -lrte_security -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += -lrte_mempool -+_LDLIBS-$(CONFIG_RTE_DRIVER_MEMPOOL_RING) += -lrte_mempool_ring -+_LDLIBS-$(CONFIG_RTE_LIBRTE_RING) += -lrte_ring -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PCI) += -lrte_pci -+_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrte_eal -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CMDLINE) += -lrte_cmdline -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrte_sched -+ -+ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_KNI) += -lrte_kni -+endif -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PCI_BUS) += -lrte_bus_pci -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VDEV_BUS) += -lrte_bus_vdev -+ -+ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n) -+# plugins (link only if static libraries) -+ -+_LDLIBS-$(CONFIG_RTE_DRIVER_MEMPOOL_STACK) += -lrte_mempool_stack -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += -lrte_pmd_af_packet -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += -lrte_pmd_ark -+_LDLIBS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += -lrte_pmd_avp -+_LDLIBS-$(CONFIG_RTE_LIBRTE_BNX2X_PMD) += -lrte_pmd_bnx2x -lz -+_LDLIBS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += -lrte_pmd_bnxt -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += -lrte_pmd_bond -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += -lrte_pmd_cxgbe -+ifeq ($(CONFIG_RTE_LIBRTE_DPAA_BUS),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA_BUS) += -lrte_bus_dpaa -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA_MEMPOOL) += -lrte_mempool_dpaa -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA_PMD) += -lrte_pmd_dpaa -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += -lrte_pmd_dpaa2 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_E1000_PMD) += -lrte_pmd_e1000 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ENA_PMD) += -lrte_pmd_ena -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += -lrte_pmd_enic -+_LDLIBS-$(CONFIG_RTE_LIBRTE_FM10K_PMD) += -lrte_pmd_fm10k -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_FAILSAFE) += -lrte_pmd_failsafe -+_LDLIBS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += -lrte_pmd_i40e -+_LDLIBS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += -lrte_pmd_ixgbe -+ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KNI) += -lrte_pmd_kni -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_LIO_PMD) += -lrte_pmd_lio -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += -lrte_pmd_mlx4 -libverbs -lmlx4 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += -lrte_pmd_mlx5 -libverbs -lmlx5 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MRVL_PMD) += -lrte_pmd_mrvl -L$(LIBMUSDK_PATH)/lib -lmusdk -+_LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += -lrte_pmd_nfp -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL) += -lrte_pmd_null -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_PCAP) += -lrte_pmd_pcap -lpcap -+_LDLIBS-$(CONFIG_RTE_LIBRTE_QEDE_PMD) += -lrte_pmd_qede -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_RING) += -lrte_pmd_ring -+ifeq ($(CONFIG_RTE_LIBRTE_SCHED),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SOFTNIC) += -lrte_pmd_softnic -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SFC_EFX_PMD) += -lrte_pmd_sfc_efx -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SZEDATA2) += -lrte_pmd_szedata2 -lsze2 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += -lrte_pmd_tap -+_LDLIBS-$(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD) += -lrte_pmd_thunderx_nicvf -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += -lrte_pmd_virtio -+ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += -lrte_pmd_vhost -+endif # 
$(CONFIG_RTE_LIBRTE_VHOST) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio -+ -+ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += -lrte_pmd_aesni_mb -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += -L$(AESNI_MULTI_BUFFER_LIB_PATH) -lIPSec_MB -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_GCM) += -lrte_pmd_aesni_gcm -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_GCM) += -L$(AESNI_MULTI_BUFFER_LIB_PATH) -lIPSec_MB -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_OPENSSL) += -lrte_pmd_openssl -lcrypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL_CRYPTO) += -lrte_pmd_null_crypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_QAT) += -lrte_pmd_qat -lcrypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SNOW3G) += -lrte_pmd_snow3g -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SNOW3G) += -L$(LIBSSO_SNOW3G_PATH)/build -lsso_snow3g -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KASUMI) += -lrte_pmd_kasumi -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KASUMI) += -L$(LIBSSO_KASUMI_PATH)/build -lsso_kasumi -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ZUC) += -lrte_pmd_zuc -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ZUC) += -L$(LIBSSO_ZUC_PATH)/build -lsso_zuc -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += -lrte_pmd_armv8 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += -L$(ARMV8_CRYPTO_LIB_PATH) -larmv8_crypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_MRVL_CRYPTO) += -L$(LIBMUSDK_PATH)/lib -lrte_pmd_mrvl_crypto -lmusdk -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_CRYPTO_SCHEDULER) += -lrte_pmd_crypto_scheduler -+ifeq ($(CONFIG_RTE_LIBRTE_FSLMC_BUS),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA2_SEC) += -lrte_pmd_dpaa2_sec -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA2_SEC) += -lrte_mempool_dpaa2 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA2_SEC) += -lrte_bus_fslmc -+endif # CONFIG_RTE_LIBRTE_FSLMC_BUS -+ -+ifeq ($(CONFIG_RTE_LIBRTE_DPAA_BUS),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA_SEC) += -lrte_bus_dpaa -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA_SEC) += -lrte_pmd_dpaa_sec -+endif # CONFIG_RTE_LIBRTE_DPAA_BUS -+ -+endif # CONFIG_RTE_LIBRTE_CRYPTODEV -+ -+ifeq ($(CONFIG_RTE_LIBRTE_DPAA2_PMD),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += -lrte_bus_fslmc -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += -lrte_mempool_dpaa2 -+endif # CONFIG_RTE_LIBRTE_DPAA2_PMD -+ -+endif # !CONFIG_RTE_BUILD_SHARED_LIBS -+ -+_LDLIBS-y += --no-whole-archive -+ -+ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n) -+# The static libraries do not know their dependencies. -+# So linking with static library requires explicit dependencies. 
-+_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrt -+ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lnuma -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lm -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrt -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER) += -lm -+_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lm -+ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lnuma -+endif -+_LDLIBS-$(CONFIG_RTE_PORT_PCAP) += -lpcap -+endif # !CONFIG_RTE_BUILD_SHARED_LIBS -+ -+_LDLIBS-y += $(EXECENV_LDLIBS) -+ -+LDLIBS += $(_LDLIBS-y) $(CPU_LDLIBS) $(EXTRA_LDLIBS) -+ -+# all the words except the first one -+allbutfirst = $(wordlist 2,$(words $(1)),$(1)) -+ -+# Eliminate duplicates without sorting, only keep the last occurrence -+filter-libs = \ -+ $(if $(1),$(strip\ -+ $(if \ -+ $(and \ -+ $(filter $(firstword $(1)),$(call allbutfirst,$(1))),\ -+ $(filter -l%,$(firstword $(1)))),\ -+ ,\ -+ $(firstword $(1))) \ -+ $(call filter-libs,$(call allbutfirst,$(1))))) -+ -+LDLIBS := $(call filter-libs,$(LDLIBS)) -+ -+ifeq ($(RTE_DEVEL_BUILD)$(CONFIG_RTE_BUILD_SHARED_LIB),yy) -+LDFLAGS += -rpath=$(RTE_SDK_BIN)/lib -+endif -+ -+MAPFLAGS = -Map=$@.map --cref -+ -+.PHONY: all -+all: install -+ -+.PHONY: install -+install: build _postinstall -+ -+_postinstall: build -+ -+.PHONY: build -+build: _postbuild -+ -+exe2cmd = $(strip $(call dotfile,$(patsubst %,%.cmd,$(1)))) -+ -+ifeq ($(LINK_USING_CC),1) -+O_TO_EXE = $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $(OBJS-y) $(call linkerprefix, \ -+ $(LDLIBS) $(LDFLAGS) $(LDFLAGS_$(@)) $(EXTRA_LDFLAGS) \ -+ $(MAPFLAGS)) -+else -+O_TO_EXE = $(LD) -o $@ $(OBJS-y) \ -+ $(LDLIBS) $(LDFLAGS) $(LDFLAGS_$(@)) $(EXTRA_LDFLAGS) \ -+ $(MAPFLAGS) -+endif -+O_TO_EXE_STR = $(subst ','\'',$(O_TO_EXE)) #'# fix syntax highlight -+O_TO_EXE_DISP = $(if $(V),"$(O_TO_EXE_STR)"," LD $(@)") -+O_TO_EXE_CMD = "cmd_$@ = $(O_TO_EXE_STR)" -+O_TO_EXE_DO = @set -e; \ -+ echo $(O_TO_EXE_DISP); \ -+ $(O_TO_EXE) && \ -+ echo $(O_TO_EXE_CMD) > $(call exe2cmd,$(@)) -+ -+-include .$(APP).cmd -+ -+# path where libraries are retrieved -+LDLIBS_PATH := $(subst -Wl$(comma)-L,,$(filter -Wl$(comma)-L%,$(LDLIBS))) -+LDLIBS_PATH += $(subst -L,,$(filter -L%,$(LDLIBS))) -+ -+# list of .a files that are linked to this application -+LDLIBS_NAMES := $(patsubst -l%,lib%.a,$(filter -l%,$(LDLIBS))) -+LDLIBS_NAMES += $(patsubst -Wl$(comma)-l%,lib%.a,$(filter -Wl$(comma)-l%,$(LDLIBS))) -+ -+# list of found libraries files (useful for deps). 
If not found, the -+# library is silently ignored and dep won't be checked -+LDLIBS_FILES := $(sort $(wildcard $(foreach dir,$(LDLIBS_PATH),\ -+ $(addprefix $(dir)/,$(LDLIBS_NAMES))))) -+ -+# -+# Compile executable file if needed -+# -+$(APP): $(OBJS-y) $(LDLIBS_FILES) $(DEP_$(APP)) $(LDSCRIPT) FORCE -+ @[ -d $(dir $@) ] || mkdir -p $(dir $@) -+ $(if $(D),\ -+ @echo -n "$< -> $@ " ; \ -+ echo -n "file_missing=$(call boolean,$(file_missing)) " ; \ -+ echo -n "cmdline_changed=$(call boolean,$(call cmdline_changed,$(O_TO_EXE_STR))) " ; \ -+ echo -n "depfile_missing=$(call boolean,$(depfile_missing)) " ; \ -+ echo "depfile_newer=$(call boolean,$(depfile_newer)) ") -+ $(if $(or \ -+ $(file_missing),\ -+ $(call cmdline_changed,$(O_TO_EXE_STR)),\ -+ $(depfile_missing),\ -+ $(depfile_newer)),\ -+ $(O_TO_EXE_DO)) -+ -+# -+# install app in $(RTE_OUTPUT)/app -+# -+$(RTE_OUTPUT)/app/$(APP): $(APP) -+ @echo " INSTALL-APP $(APP)" -+ @[ -d $(RTE_OUTPUT)/app ] || mkdir -p $(RTE_OUTPUT)/app -+ $(Q)cp -f $(APP) $(RTE_OUTPUT)/app -+ -+# -+# install app map file in $(RTE_OUTPUT)/app -+# -+$(RTE_OUTPUT)/app/$(APP).map: $(APP) -+ @echo " INSTALL-MAP $(APP).map" -+ @[ -d $(RTE_OUTPUT)/app ] || mkdir -p $(RTE_OUTPUT)/app -+ $(Q)cp -f $(APP).map $(RTE_OUTPUT)/app -+ -+# -+# Clean all generated files -+# -+.PHONY: clean -+clean: _postclean -+ $(Q)rm -f $(_BUILD_TARGETS) $(_INSTALL_TARGETS) $(_CLEAN_TARGETS) -+ -+.PHONY: doclean -+doclean: -+ $(Q)rm -rf $(APP) $(OBJS-all) $(DEPS-all) $(DEPSTMP-all) \ -+ $(CMDS-all) $(INSTALL-FILES-all) .$(APP).cmd $(APP).map -+ -+ -+include $(RTE_SDK)/mk/internal/rte.compile-post.mk -+include $(RTE_SDK)/mk/internal/rte.install-post.mk -+include $(RTE_SDK)/mk/internal/rte.clean-post.mk -+include $(RTE_SDK)/mk/internal/rte.build-post.mk -+ -+ifneq ($(wildcard $(RTE_SDK)/mk/target/$(RTE_TARGET)/rte.app.mk),) -+include $(RTE_SDK)/mk/target/$(RTE_TARGET)/rte.app.mk -+else -+include $(RTE_SDK)/mk/target/generic/rte.app.mk -+endif -+ -+.PHONY: FORCE -+FORCE: -+ diff --git a/patch/dpdk-stable-17.11.2/0004-patch-dpdk-17.11.2-to-support-rh75-provided-by-Jason.patch b/patch/dpdk-stable-17.11.2/0004-patch-dpdk-17.11.2-to-support-rh75-provided-by-Jason.patch deleted file mode 100644 index 3cb205562..000000000 --- a/patch/dpdk-stable-17.11.2/0004-patch-dpdk-17.11.2-to-support-rh75-provided-by-Jason.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 95e115dd2bfe5f7a7e54af0f73577af0a68fdba0 Mon Sep 17 00:00:00 2001 -From: wencyu -Date: Wed, 13 Nov 2019 10:17:35 +0800 -Subject: [PATCH 4/4] patch dpdk-17.11.2 to support rh75 (provided by Jason Joo - ) - ---- - lib/librte_eal/linuxapp/kni/compat.h | 6 ++++++ - lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h | 6 ++++++ - 2 files changed, 12 insertions(+) - -diff --git a/lib/librte_eal/linuxapp/kni/compat.h b/lib/librte_eal/linuxapp/kni/compat.h -index 3f8c0bc..be707bc 100644 ---- a/lib/librte_eal/linuxapp/kni/compat.h -+++ b/lib/librte_eal/linuxapp/kni/compat.h -@@ -101,6 +101,12 @@ - #undef NET_NAME_UNKNOWN - #endif - -+#if (defined(RHEL_RELEASE_CODE) && \ -+ (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5)) && \ -+ (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8, 0))) -+#define ndo_change_mtu ndo_change_mtu_rh74 -+#endif -+ - #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER - #endif -diff --git a/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h b/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h -index 443a3f2..46cad90 100644 ---- a/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h -+++ 
b/lib/librte_eal/linuxapp/kni/ethtool/igb/kcompat.h -@@ -3932,6 +3932,12 @@ static inline struct sk_buff *__kc__vlan_hwaccel_put_tag(struct sk_buff *skb, - #define vlan_tx_tag_present skb_vlan_tag_present - #endif - -+#if (defined(RHEL_RELEASE_CODE) && \ -+ (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5)) && \ -+ (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8, 0))) -+#define ndo_change_mtu ndo_change_mtu_rh74 -+#endif -+ - #if ((LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) || \ - (SLE_VERSION_CODE && SLE_VERSION_CODE >= SLE_VERSION(12, 3, 0))) - #define HAVE_VF_VLAN_PROTO --- -1.8.3.1 - diff --git a/patch/dpdk-stable-17.11.6/0002-net-support-variable-IP-header-len-for-checksum-API.patch b/patch/dpdk-stable-17.11.6/0002-net-support-variable-IP-header-len-for-checksum-API.patch deleted file mode 100644 index 0a24083af..000000000 --- a/patch/dpdk-stable-17.11.6/0002-net-support-variable-IP-header-len-for-checksum-API.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 4be86649fd61173409040093eecffdbc30570988 Mon Sep 17 00:00:00 2001 -From: ywc689 -Date: Fri, 28 Jun 2019 17:48:12 +0800 -Subject: [PATCH 2/3] net: support variable IP header len for checksum API. - -IPv4 checksum APIs use fixe IP header length, it will failed if there is -any IP option. Now calculating header length by "ihl" field, so that we -can support options. ---- - lib/librte_net/rte_ip.h | 13 +++++++------ - 1 file changed, 7 insertions(+), 6 deletions(-) - -diff --git a/lib/librte_net/rte_ip.h b/lib/librte_net/rte_ip.h -index 8d4907f..0d504f6 100644 ---- a/lib/librte_net/rte_ip.h -+++ b/lib/librte_net/rte_ip.h -@@ -314,7 +314,7 @@ struct ipv4_hdr { - rte_ipv4_cksum(const struct ipv4_hdr *ipv4_hdr) - { - uint16_t cksum; -- cksum = rte_raw_cksum(ipv4_hdr, sizeof(struct ipv4_hdr)); -+ cksum = rte_raw_cksum(ipv4_hdr, (ipv4_hdr->version_ihl & 0xf) * 4); - return (cksum == 0xffff) ? 
cksum : (uint16_t)~cksum; - } - -@@ -356,7 +356,7 @@ struct ipv4_hdr { - } else { - psd_hdr.len = rte_cpu_to_be_16( - (uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length) -- - sizeof(struct ipv4_hdr))); -+ - (ipv4_hdr->version_ihl & 0xf) * 4)); - } - return rte_raw_cksum(&psd_hdr, sizeof(psd_hdr)); - } -@@ -379,13 +379,14 @@ struct ipv4_hdr { - rte_ipv4_udptcp_cksum(const struct ipv4_hdr *ipv4_hdr, const void *l4_hdr) - { - uint32_t cksum; -- uint32_t l3_len, l4_len; -+ uint32_t l3_len, l4_len, iphlen; - - l3_len = rte_be_to_cpu_16(ipv4_hdr->total_length); -- if (l3_len < sizeof(struct ipv4_hdr)) -- return 0; -+ iphlen = (ipv4_hdr->version_ihl & 0xf) * 4; - -- l4_len = l3_len - sizeof(struct ipv4_hdr); -+ if (l3_len < iphlen) -+ return 0; -+ l4_len = l3_len - iphlen; - - cksum = rte_raw_cksum(l4_hdr, l4_len); - cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0); --- -1.8.3.1 - diff --git a/patch/dpdk-stable-17.11.6/0003-pdump-enable-and-change-dpdk-pdump-tool-for-DPVS.patch b/patch/dpdk-stable-17.11.6/0003-pdump-enable-and-change-dpdk-pdump-tool-for-DPVS.patch deleted file mode 100644 index b7eebfdcd..000000000 --- a/patch/dpdk-stable-17.11.6/0003-pdump-enable-and-change-dpdk-pdump-tool-for-DPVS.patch +++ /dev/null @@ -1,920 +0,0 @@ -diff -uparN dpdk-stable-17.11.6/app/pdump/main.c dpdk-stable-17.11.6-new/app/pdump/main.c ---- dpdk-stable-17.11.6/app/pdump/main.c 2019-05-22 03:15:57.000000000 +0800 -+++ dpdk-stable-17.11.6-new/app/pdump/main.c 2020-08-24 11:09:32.166622729 +0800 -@@ -54,6 +54,7 @@ - #include - #include - #include -+#include - - #define CMD_LINE_OPT_PDUMP "pdump" - #define PDUMP_PORT_ARG "port" -@@ -65,6 +66,13 @@ - #define PDUMP_RING_SIZE_ARG "ring-size" - #define PDUMP_MSIZE_ARG "mbuf-size" - #define PDUMP_NUM_MBUFS_ARG "total-num-mbufs" -+#define PDUMP_HOST_ARG "host" -+#define PDUMP_SRC_ARG "src-host" -+#define PDUMP_DST_ARG "dst-host" -+#define PDUMP_PROTO_PORT_AGE "proto-port" -+#define PDUMP_SPORT_ARG "src-port" -+#define PDUMP_DPORT_ARG "dst-port" -+#define PDUMP_PROTO_ARG "proto" - #define CMD_LINE_OPT_SER_SOCK_PATH "server-socket-path" - #define CMD_LINE_OPT_CLI_SOCK_PATH "client-socket-path" - -@@ -120,6 +128,13 @@ const char *valid_pdump_arguments[] = { - PDUMP_RING_SIZE_ARG, - PDUMP_MSIZE_ARG, - PDUMP_NUM_MBUFS_ARG, -+ PDUMP_HOST_ARG, -+ PDUMP_SRC_ARG, -+ PDUMP_DST_ARG, -+ PDUMP_PROTO_PORT_AGE, -+ PDUMP_SPORT_ARG, -+ PDUMP_DPORT_ARG, -+ PDUMP_PROTO_ARG, - NULL - }; - -@@ -153,6 +168,7 @@ struct pdump_tuples { - enum pcap_stream rx_vdev_stream_type; - enum pcap_stream tx_vdev_stream_type; - bool single_pdump_dev; -+ struct pdump_filter *filter; - - /* stats */ - struct pdump_stats stats; -@@ -180,6 +196,11 @@ pdump_usage(const char *prgname) - "(queue=)," - "(rx-dev= |" - " tx-dev=," -+ "[host= | src-host= |" -+ "dst-host=]," -+ "[proto=support:tcp/udp/icmp]," -+ "[proto-port= |src-port= |" -+ "dst-port=]," - "[ring-size=default:16384]," - "[mbuf-size=default:2176]," - "[total-num-mbufs=default:65535]'\n" -@@ -270,6 +291,60 @@ parse_uint_value(const char *key, const - } - - static int -+parse_host(const char *key __rte_unused, const char *value, void *extra_args) { -+ struct pdump_tuples *pt =extra_args; -+ struct in_addr inaddr; -+ struct in6_addr inaddr6; -+ union addr addr; -+ int af = 0; -+ -+ if (inet_pton(AF_INET6, value, &inaddr6) > 0) { -+ af = AF_INET6; -+ addr.in6 = inaddr6; -+ } else if (inet_pton(AF_INET, value, &inaddr) > 0){ -+ af = AF_INET; -+ addr.in = inaddr; -+ } else { -+ printf("IP address invalid\n"); -+ return -EINVAL; -+ } -+ -+ if (pt->filter && 
pt->filter->af != 0 && af != pt->filter->af) { -+ printf("IPV4 and IPV6 conflict \n"); -+ return -EINVAL; -+ } else { -+ pt->filter->af = af; -+ } -+ -+ if (!strcmp(key, PDUMP_HOST_ARG)) { -+ rte_memcpy(&pt->filter->host_addr, &addr, sizeof(addr)); -+ } else if (!strcmp(key, PDUMP_SRC_ARG)){ -+ rte_memcpy(&pt->filter->s_addr, &addr, sizeof(addr)); -+ } else if (!strcmp(key, PDUMP_DST_ARG)){ -+ rte_memcpy(&pt->filter->d_addr, &addr, sizeof(addr)); -+ } -+ return 0; -+} -+ -+static int -+parse_proto(const char *key __rte_unused, const char *value, void *extra_args) { -+ struct pdump_tuples *pt =extra_args; -+ -+ if (!strcmp(value, "tcp")) { -+ pt->filter->proto = IPPROTO_TCP; -+ } else if (!strcmp(value, "udp")) { -+ pt->filter->proto = IPPROTO_UDP; -+ } else if (!strcmp(value, "icmp")) { -+ pt->filter->proto = IPPROTO_ICMP; -+ } else { -+ printf("invalid value:\"%s\" for key:\"%s\", " -+ "value must be tcp/udp/icmp\n", value, key); -+ return -EINVAL; -+ } -+ return 0; -+} -+ -+static int - parse_pdump(const char *optarg) - { - struct rte_kvargs *kvlist; -@@ -396,6 +471,73 @@ parse_pdump(const char *optarg) - } else - pt->total_num_mbufs = MBUFS_PER_POOL; - -+ /*filter parsing and validation*/ -+ pt->filter = rte_zmalloc("pdump_filter", -+ sizeof(struct pdump_filter), 0); -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_HOST_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_HOST_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_SRC_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_SRC_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_DST_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_DST_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_PORT_AGE); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_PORT_AGE, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->proto_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_SPORT_ARG); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_SPORT_ARG, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->s_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_DPORT_ARG); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_DPORT_ARG, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->d_port = (uint16_t) v.val; -+ } -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_ARG, -+ &parse_proto, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ - num_tuples++; - - free_kvlist: -@@ -540,6 +682,8 @@ cleanup_rings(void) - rte_ring_free(pt->rx_ring); - if (pt->tx_ring) - rte_ring_free(pt->tx_ring); -+ if (pt->filter) -+ rte_free(pt->filter); - } - } - -@@ -583,11 +727,10 @@ configure_vdev(uint16_t port_id) - { - struct ether_addr addr; - const uint16_t rxRings = 0, txRings = 1; -- const uint8_t nb_ports = rte_eth_dev_count(); - int ret; - uint16_t q; - -- if (port_id > nb_ports) -+ if (!rte_eth_dev_is_valid_port(port_id)) - return -1; - - ret = rte_eth_dev_configure(port_id, rxRings, txRings, -@@ -799,20 +942,20 @@ enable_pdump(void) - pt->queue, - RTE_PDUMP_FLAG_RX, - pt->rx_ring, -- pt->mp, 
NULL); -+ pt->mp, pt->filter); - ret1 = rte_pdump_enable_by_deviceid( - pt->device_id, - pt->queue, - RTE_PDUMP_FLAG_TX, - pt->tx_ring, -- pt->mp, NULL); -+ pt->mp, pt->filter); - } else if (pt->dump_by_type == PORT_ID) { - ret = rte_pdump_enable(pt->port, pt->queue, - RTE_PDUMP_FLAG_RX, -- pt->rx_ring, pt->mp, NULL); -+ pt->rx_ring, pt->mp, pt->filter); - ret1 = rte_pdump_enable(pt->port, pt->queue, - RTE_PDUMP_FLAG_TX, -- pt->tx_ring, pt->mp, NULL); -+ pt->tx_ring, pt->mp, pt->filter); - } - } else if (pt->dir == RTE_PDUMP_FLAG_RX) { - if (pt->dump_by_type == DEVICE_ID) -@@ -820,22 +963,22 @@ enable_pdump(void) - pt->device_id, - pt->queue, - pt->dir, pt->rx_ring, -- pt->mp, NULL); -+ pt->mp, pt->filter); - else if (pt->dump_by_type == PORT_ID) - ret = rte_pdump_enable(pt->port, pt->queue, - pt->dir, -- pt->rx_ring, pt->mp, NULL); -+ pt->rx_ring, pt->mp, pt->filter); - } else if (pt->dir == RTE_PDUMP_FLAG_TX) { - if (pt->dump_by_type == DEVICE_ID) - ret = rte_pdump_enable_by_deviceid( - pt->device_id, - pt->queue, - pt->dir, -- pt->tx_ring, pt->mp, NULL); -+ pt->tx_ring, pt->mp, pt->filter); - else if (pt->dump_by_type == PORT_ID) - ret = rte_pdump_enable(pt->port, pt->queue, - pt->dir, -- pt->tx_ring, pt->mp, NULL); -+ pt->tx_ring, pt->mp, pt->filter); - } - if (ret < 0 || ret1 < 0) { - cleanup_pdump_resources(); -diff -uparN dpdk-stable-17.11.6/app/pdump/Makefile dpdk-stable-17.11.6-new/app/pdump/Makefile ---- dpdk-stable-17.11.6/app/pdump/Makefile 2019-05-22 03:15:57.000000000 +0800 -+++ dpdk-stable-17.11.6-new/app/pdump/Makefile 2020-08-19 19:12:09.011111072 +0800 -@@ -41,6 +41,6 @@ CFLAGS += $(WERROR_FLAGS) - - SRCS-y := main.c - --include $(RTE_SDK)/mk/rte.app.mk -+include $(RTE_SDK)/mk/rte.pdump.mk - - endif -diff -uparN dpdk-stable-17.11.6/config/common_base dpdk-stable-17.11.6-new/config/common_base ---- dpdk-stable-17.11.6/config/common_base 2019-05-22 03:15:57.000000000 +0800 -+++ dpdk-stable-17.11.6-new/config/common_base 2020-08-19 19:12:09.011111072 +0800 -@@ -399,7 +399,7 @@ CONFIG_RTE_PMD_RING_MAX_TX_RINGS=16 - # - # Compile software PMD backed by PCAP files - # --CONFIG_RTE_LIBRTE_PMD_PCAP=n -+CONFIG_RTE_LIBRTE_PMD_PCAP=y - - # - # Compile link bonding PMD library -diff -uparN dpdk-stable-17.11.6/lib/librte_pdump/rte_pdump.c dpdk-stable-17.11.6-new/lib/librte_pdump/rte_pdump.c ---- dpdk-stable-17.11.6/lib/librte_pdump/rte_pdump.c 2019-05-22 03:15:57.000000000 +0800 -+++ dpdk-stable-17.11.6-new/lib/librte_pdump/rte_pdump.c 2020-08-24 12:03:10.436176887 +0800 -@@ -46,6 +46,10 @@ - #include - #include - #include -+#include -+#include -+#include -+#include - - #include "rte_pdump.h" - -@@ -177,6 +181,132 @@ pdump_pktmbuf_copy(struct rte_mbuf *m, s - return m_dup; - } - -+static bool -+inet_addr_equal(int af, const union addr *a1, const union addr *a2) -+{ -+ switch (af) { -+ case AF_INET: -+ return a1->in.s_addr == a2->in.s_addr; -+ case AF_INET6: -+ return memcmp(a1->in6.s6_addr, a2->in6.s6_addr, 16) == 0; -+ default: -+ return memcmp(a1, a2, sizeof(union addr)) == 0; -+ } -+} -+ -+static bool -+inet_is_addr_any(int af, const union addr *addr) -+{ -+ switch (af) { -+ case AF_INET: -+ return addr->in.s_addr == htonl(INADDR_ANY); -+ case AF_INET6: -+ return IN6_ARE_ADDR_EQUAL(&addr->in6, &in6addr_any); -+ default: -+ return false; -+ } -+ -+} -+ -+/* support vlan/arp/ipv4ipv6 */ -+static int -+pdump_filter(struct rte_mbuf *m, struct pdump_filter *filter) -+{ -+ struct ether_hdr *eth_hdr; -+ struct vlan_eth_hdr *vlan_eth_hdr; -+ union addr s_addr, d_addr; -+ int prepend = 0; 
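Both the checksum patch and the pdump_filter code here derive the IPv4 header length from the low nibble of version_ihl, i.e. (version_ihl & 0xf) * 4, so IP options are counted instead of assuming the fixed sizeof(struct ipv4_hdr). A small self-contained check of that computation:

/*
 * The low nibble of version_ihl is the header length in 32-bit words,
 * so (version_ihl & 0xf) * 4 is the byte length, options included.
 */
#include <assert.h>
#include <stdint.h>

static uint16_t ipv4_hdr_len(uint8_t version_ihl)
{
    return (uint16_t)((version_ihl & 0xf) * 4);
}

int main(void)
{
    assert(ipv4_hdr_len(0x45) == 20);  /* IHL = 5 words: no options */
    assert(ipv4_hdr_len(0x46) == 24);  /* IHL = 6 words: 4 bytes of options */
    assert(ipv4_hdr_len(0x4f) == 60);  /* IHL = 15 words: maximum header */
    return 0;
}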
-+ uint16_t type = 0; -+ uint16_t iph_len = 0; -+ uint8_t proto = 0; -+ -+ int af; -+ -+ if (filter->af == 0 && filter->s_port == 0 && -+ filter->d_port == 0 && filter->proto == 0) -+ return 0; -+ -+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); -+ -+ if (eth_hdr->ether_type == htons(ETH_P_8021Q)) { -+ prepend += sizeof(struct vlan_eth_hdr); -+ vlan_eth_hdr = rte_pktmbuf_mtod(m, struct vlan_eth_hdr *); -+ type = vlan_eth_hdr->h_vlan_encapsulated_proto; -+ } else { -+ prepend += sizeof(struct ether_hdr); -+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); -+ type = eth_hdr->ether_type; -+ } -+ -+ if (rte_pktmbuf_adj(m, prepend) == NULL) -+ goto prepend; -+ -+ if (type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) { -+ struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *); -+ af = AF_INET; -+ s_addr.in.s_addr = arp->arp_data.arp_sip; -+ d_addr.in.s_addr = arp->arp_data.arp_tip; -+ //proto = IPPROTO_ICMP; -+ } else if (type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) { -+ struct ipv4_hdr *ip4 = rte_pktmbuf_mtod(m, struct ipv4_hdr *); -+ af = AF_INET; -+ s_addr.in.s_addr = ip4->src_addr; -+ d_addr.in.s_addr = ip4->dst_addr; -+ proto = ip4->next_proto_id; -+ iph_len = (ip4->version_ihl & 0xf) << 2; -+ } else if (type == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) { -+ struct ipv6_hdr *ip6 = rte_pktmbuf_mtod(m, struct ipv6_hdr *); -+ af = AF_INET6; -+ rte_memcpy(&s_addr.in6, &ip6->src_addr, 16); -+ rte_memcpy(&d_addr.in6, &ip6->dst_addr, 16); -+ proto = ip6->proto; -+ iph_len = sizeof(struct ipv6_hdr); -+ } else { -+ goto prepend; -+ } -+ -+ /* filter */ -+ if (!inet_is_addr_any(af, &filter->s_addr) && -+ !inet_addr_equal(af, &filter->s_addr, &s_addr)) -+ goto prepend; -+ if (!inet_is_addr_any(af, &filter->d_addr) && -+ !inet_addr_equal(af, &filter->d_addr, &d_addr)) -+ goto prepend; -+ if (!inet_is_addr_any(af, &filter->host_addr) && -+ !inet_addr_equal(af, &filter->host_addr, &s_addr) && -+ !inet_addr_equal(af, &filter->host_addr, &d_addr)) -+ goto prepend; -+ -+ if (filter->proto && filter->proto != proto) -+ goto prepend; -+ -+ if (filter->s_port || filter->d_port) { -+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) -+ goto prepend; -+ struct udp_hdr _uh; -+ const struct udp_hdr *uh; -+ uh = rte_pktmbuf_read(m, iph_len, sizeof(_uh), &_uh); -+ if (uh == NULL) -+ goto prepend; -+ if (filter->s_port && filter->s_port != rte_cpu_to_be_16(uh->src_port)) -+ goto prepend; -+ if (filter->d_port && filter->d_port != rte_cpu_to_be_16(uh->dst_port)) -+ goto prepend; -+ -+ if (filter->proto_port && -+ filter->proto_port != rte_cpu_to_be_16(uh->src_port) && -+ filter->proto_port != rte_cpu_to_be_16(uh->dst_port)) -+ goto prepend; -+ } -+ -+ rte_pktmbuf_prepend(m, prepend); -+ return 0; -+ -+prepend: -+ rte_pktmbuf_prepend(m, prepend); -+ return -1; -+} -+ - static inline void - pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params) - { -@@ -193,6 +323,8 @@ pdump_copy(struct rte_mbuf **pkts, uint1 - ring = cbs->ring; - mp = cbs->mp; - for (i = 0; i < nb_pkts; i++) { -+ if (pdump_filter(pkts[i], cbs->filter) != 0) -+ continue; - p = pdump_pktmbuf_copy(pkts[i], mp); - if (p) - dup_bufs[d_pkts++] = p; -@@ -229,7 +361,7 @@ pdump_tx(uint16_t port __rte_unused, uin - static int - pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, - struct rte_ring *ring, struct rte_mempool *mp, -- uint16_t operation) -+ struct pdump_filter *filter, uint16_t operation) - { - uint16_t qid; - struct pdump_rxtx_cbs *cbs = NULL; -@@ -247,6 +379,7 @@ pdump_register_rx_callbacks(uint16_t end - } - 
cbs->ring = ring; - cbs->mp = mp; -+ cbs->filter = filter; - cbs->cb = rte_eth_add_first_rx_callback(port, qid, - pdump_rx, cbs); - if (cbs->cb == NULL) { -@@ -283,7 +416,7 @@ pdump_register_rx_callbacks(uint16_t end - static int - pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, - struct rte_ring *ring, struct rte_mempool *mp, -- uint16_t operation) -+ struct pdump_filter *filter, uint16_t operation) - { - - uint16_t qid; -@@ -302,6 +435,7 @@ pdump_register_tx_callbacks(uint16_t end - } - cbs->ring = ring; - cbs->mp = mp; -+ cbs->filter = filter; - cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx, - cbs); - if (cbs->cb == NULL) { -@@ -345,6 +479,7 @@ set_pdump_rxtx_cbs(struct pdump_request - uint16_t operation; - struct rte_ring *ring; - struct rte_mempool *mp; -+ struct pdump_filter *filter; - - flags = p->flags; - operation = p->op; -@@ -360,6 +495,7 @@ set_pdump_rxtx_cbs(struct pdump_request - queue = p->data.en_v1.queue; - ring = p->data.en_v1.ring; - mp = p->data.en_v1.mp; -+ filter = p->data.en_v1.filter; - } else { - ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device, - &port); -@@ -372,6 +508,7 @@ set_pdump_rxtx_cbs(struct pdump_request - queue = p->data.dis_v1.queue; - ring = p->data.dis_v1.ring; - mp = p->data.dis_v1.mp; -+ filter = p->data.dis_v1.filter; - } - - /* validation if packet capture is for all queues */ -@@ -403,7 +540,7 @@ set_pdump_rxtx_cbs(struct pdump_request - if (flags & RTE_PDUMP_FLAG_RX) { - end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_rx_q : queue + 1; - ret = pdump_register_rx_callbacks(end_q, port, queue, ring, mp, -- operation); -+ filter, operation); - if (ret < 0) - return ret; - } -@@ -412,7 +549,7 @@ set_pdump_rxtx_cbs(struct pdump_request - if (flags & RTE_PDUMP_FLAG_TX) { - end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_tx_q : queue + 1; - ret = pdump_register_tx_callbacks(end_q, port, queue, ring, mp, -- operation); -+ filter, operation); - if (ret < 0) - return ret; - } -diff -uparN dpdk-stable-17.11.6/lib/librte_pdump/rte_pdump.h dpdk-stable-17.11.6-new/lib/librte_pdump/rte_pdump.h ---- dpdk-stable-17.11.6/lib/librte_pdump/rte_pdump.h 2019-05-22 03:15:57.000000000 +0800 -+++ dpdk-stable-17.11.6-new/lib/librte_pdump/rte_pdump.h 2020-08-19 19:12:09.012111071 +0800 -@@ -44,6 +44,8 @@ - #include - #include - #include -+#include -+#include - - #ifdef __cplusplus - extern "C" { -@@ -63,6 +65,31 @@ enum rte_pdump_socktype { - RTE_PDUMP_SOCKET_CLIENT = 2 - }; - -+union addr { -+ struct in_addr in; -+ struct in6_addr in6; -+}; -+ -+struct pdump_filter { -+ int af; -+ union addr s_addr; //s_addr or dst_addr -+ union addr d_addr; //s_addr or dst_addr -+ union addr host_addr; //s_addr or dst_addr -+ -+ uint8_t proto; -+ uint16_t proto_port; -+ uint16_t s_port; -+ uint16_t d_port; -+}; -+ -+struct vlan_eth_hdr { -+ unsigned char h_dest[ETH_ALEN]; -+ unsigned char h_source[ETH_ALEN]; -+ unsigned short h_vlan_proto; -+ unsigned short h_vlan_TCI; -+ unsigned short h_vlan_encapsulated_proto; -+}; -+ - /** - * Initialize packet capturing handling - * -diff -uparN dpdk-stable-17.11.6/mk/rte.pdump.mk dpdk-stable-17.11.6-new/mk/rte.pdump.mk ---- dpdk-stable-17.11.6/mk/rte.pdump.mk 1970-01-01 08:00:00.000000000 +0800 -+++ dpdk-stable-17.11.6-new/mk/rte.pdump.mk 2020-08-19 19:12:09.012111071 +0800 -@@ -0,0 +1,349 @@ -+# BSD LICENSE -+# -+# Copyright(c) 2010-2017 Intel Corporation. All rights reserved. -+# Copyright(c) 2014-2015 6WIND S.A. -+# All rights reserved. 
-+# -+# Redistribution and use in source and binary forms, with or without -+# modification, are permitted provided that the following conditions -+# are met: -+# -+# * Redistributions of source code must retain the above copyright -+# notice, this list of conditions and the following disclaimer. -+# * Redistributions in binary form must reproduce the above copyright -+# notice, this list of conditions and the following disclaimer in -+# the documentation and/or other materials provided with the -+# distribution. -+# * Neither the name of Intel Corporation nor the names of its -+# contributors may be used to endorse or promote products derived -+# from this software without specific prior written permission. -+# -+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+include $(RTE_SDK)/mk/internal/rte.compile-pre.mk -+include $(RTE_SDK)/mk/internal/rte.install-pre.mk -+include $(RTE_SDK)/mk/internal/rte.clean-pre.mk -+include $(RTE_SDK)/mk/internal/rte.build-pre.mk -+ -+# VPATH contains at least SRCDIR -+VPATH += $(SRCDIR) -+ -+_BUILD = $(APP) -+_INSTALL = $(INSTALL-FILES-y) $(SYMLINK-FILES-y) -+_INSTALL += $(RTE_OUTPUT)/app/$(APP) $(RTE_OUTPUT)/app/$(APP).map -+POSTINSTALL += target-appinstall -+_CLEAN = doclean -+POSTCLEAN += target-appclean -+ -+ifeq ($(NO_LDSCRIPT),) -+LDSCRIPT = $(RTE_LDSCRIPT) -+endif -+ -+# Link only the libraries used in the application -+LDFLAGS += --as-needed -+ -+# default path for libs -+_LDLIBS-y += -L$(RTE_SDK_BIN)/lib -+ -+# -+# Order is important: from higher level to lower level -+# -+_LDLIBS-$(CONFIG_RTE_LIBRTE_FLOW_CLASSIFY) += -lrte_flow_classify -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PIPELINE) += -lrte_pipeline -+_LDLIBS-$(CONFIG_RTE_LIBRTE_TABLE) += -lrte_table -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PORT) += -lrte_port -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP) += -lrte_pdump -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += -lrte_distributor -+_LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG) += -lrte_ip_frag -+_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO) += -lrte_gro -+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO) += -lrte_gso -+_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lrte_meter -+_LDLIBS-$(CONFIG_RTE_LIBRTE_LPM) += -lrte_lpm -+# librte_acl needs --whole-archive because of weak functions -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ACL) += --whole-archive -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ACL) += --no-whole-archive -+_LDLIBS-$(CONFIG_RTE_LIBRTE_JOBSTATS) += -lrte_jobstats -+_LDLIBS-$(CONFIG_RTE_LIBRTE_METRICS) += -lrte_metrics -+_LDLIBS-$(CONFIG_RTE_LIBRTE_BITRATE) += -lrte_bitratestats -+_LDLIBS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS) += -lrte_latencystats -+_LDLIBS-$(CONFIG_RTE_LIBRTE_POWER) += -lrte_power -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_TIMER) += -lrte_timer -+_LDLIBS-$(CONFIG_RTE_LIBRTE_EFD) += -lrte_efd -+ -+_LDLIBS-y += --whole-archive -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE) += -lrte_cfgfile 
-+_LDLIBS-$(CONFIG_RTE_LIBRTE_HASH) += -lrte_hash -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lrte_vhost -+_LDLIBS-$(CONFIG_RTE_LIBRTE_KVARGS) += -lrte_kvargs -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MBUF) += -lrte_mbuf -+_LDLIBS-$(CONFIG_RTE_LIBRTE_NET) += -lrte_net -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ETHER) += -lrte_ethdev -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += -lrte_cryptodev -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SECURITY) += -lrte_security -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MEMPOOL) += -lrte_mempool -+_LDLIBS-$(CONFIG_RTE_DRIVER_MEMPOOL_RING) += -lrte_mempool_ring -+_LDLIBS-$(CONFIG_RTE_LIBRTE_RING) += -lrte_ring -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PCI) += -lrte_pci -+_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrte_eal -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CMDLINE) += -lrte_cmdline -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrte_sched -+ -+ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_KNI) += -lrte_kni -+endif -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PCI_BUS) += -lrte_bus_pci -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VDEV_BUS) += -lrte_bus_vdev -+ -+ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n) -+# plugins (link only if static libraries) -+ -+_LDLIBS-$(CONFIG_RTE_DRIVER_MEMPOOL_STACK) += -lrte_mempool_stack -+ -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += -lrte_pmd_af_packet -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += -lrte_pmd_ark -+_LDLIBS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += -lrte_pmd_avp -+_LDLIBS-$(CONFIG_RTE_LIBRTE_BNX2X_PMD) += -lrte_pmd_bnx2x -lz -+_LDLIBS-$(CONFIG_RTE_LIBRTE_BNXT_PMD) += -lrte_pmd_bnxt -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += -lrte_pmd_bond -+_LDLIBS-$(CONFIG_RTE_LIBRTE_CXGBE_PMD) += -lrte_pmd_cxgbe -+ifeq ($(CONFIG_RTE_LIBRTE_DPAA_BUS),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA_BUS) += -lrte_bus_dpaa -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA_MEMPOOL) += -lrte_mempool_dpaa -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA_PMD) += -lrte_pmd_dpaa -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += -lrte_pmd_dpaa2 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_E1000_PMD) += -lrte_pmd_e1000 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ENA_PMD) += -lrte_pmd_ena -+_LDLIBS-$(CONFIG_RTE_LIBRTE_ENIC_PMD) += -lrte_pmd_enic -+_LDLIBS-$(CONFIG_RTE_LIBRTE_FM10K_PMD) += -lrte_pmd_fm10k -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_FAILSAFE) += -lrte_pmd_failsafe -+_LDLIBS-$(CONFIG_RTE_LIBRTE_I40E_PMD) += -lrte_pmd_i40e -+_LDLIBS-$(CONFIG_RTE_LIBRTE_IXGBE_PMD) += -lrte_pmd_ixgbe -+ifeq ($(CONFIG_RTE_LIBRTE_KNI),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KNI) += -lrte_pmd_kni -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_LIO_PMD) += -lrte_pmd_lio -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += -lrte_pmd_mlx4 -libverbs -lmlx4 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += -lrte_pmd_mlx5 -libverbs -lmlx5 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MRVL_PMD) += -lrte_pmd_mrvl -L$(LIBMUSDK_PATH)/lib -lmusdk -+_LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += -lrte_pmd_nfp -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL) += -lrte_pmd_null -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_PCAP) += -lrte_pmd_pcap -lpcap -+_LDLIBS-$(CONFIG_RTE_LIBRTE_QEDE_PMD) += -lrte_pmd_qede -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_RING) += -lrte_pmd_ring -+ifeq ($(CONFIG_RTE_LIBRTE_SCHED),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SOFTNIC) += -lrte_pmd_softnic -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SFC_EFX_PMD) += -lrte_pmd_sfc_efx -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SZEDATA2) += -lrte_pmd_szedata2 -lsze2 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += -lrte_pmd_tap -+_LDLIBS-$(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD) += -lrte_pmd_thunderx_nicvf -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += -lrte_pmd_virtio -+ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += -lrte_pmd_vhost -+endif # 
$(CONFIG_RTE_LIBRTE_VHOST) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio -+ -+ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += -lrte_pmd_aesni_mb -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += -L$(AESNI_MULTI_BUFFER_LIB_PATH) -lIPSec_MB -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_GCM) += -lrte_pmd_aesni_gcm -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_GCM) += -L$(AESNI_MULTI_BUFFER_LIB_PATH) -lIPSec_MB -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_OPENSSL) += -lrte_pmd_openssl -lcrypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL_CRYPTO) += -lrte_pmd_null_crypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_QAT) += -lrte_pmd_qat -lcrypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SNOW3G) += -lrte_pmd_snow3g -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_SNOW3G) += -L$(LIBSSO_SNOW3G_PATH)/build -lsso_snow3g -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KASUMI) += -lrte_pmd_kasumi -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KASUMI) += -L$(LIBSSO_KASUMI_PATH)/build -lsso_kasumi -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ZUC) += -lrte_pmd_zuc -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ZUC) += -L$(LIBSSO_ZUC_PATH)/build -lsso_zuc -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += -lrte_pmd_armv8 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += -L$(ARMV8_CRYPTO_LIB_PATH) -larmv8_crypto -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_MRVL_CRYPTO) += -L$(LIBMUSDK_PATH)/lib -lrte_pmd_mrvl_crypto -lmusdk -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_CRYPTO_SCHEDULER) += -lrte_pmd_crypto_scheduler -+ifeq ($(CONFIG_RTE_LIBRTE_FSLMC_BUS),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA2_SEC) += -lrte_pmd_dpaa2_sec -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA2_SEC) += -lrte_mempool_dpaa2 -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA2_SEC) += -lrte_bus_fslmc -+endif # CONFIG_RTE_LIBRTE_FSLMC_BUS -+ -+ifeq ($(CONFIG_RTE_LIBRTE_DPAA_BUS),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA_SEC) += -lrte_bus_dpaa -+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_DPAA_SEC) += -lrte_pmd_dpaa_sec -+endif # CONFIG_RTE_LIBRTE_DPAA_BUS -+ -+endif # CONFIG_RTE_LIBRTE_CRYPTODEV -+ -+ifeq ($(CONFIG_RTE_LIBRTE_DPAA2_PMD),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += -lrte_bus_fslmc -+_LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_PMD) += -lrte_mempool_dpaa2 -+endif # CONFIG_RTE_LIBRTE_DPAA2_PMD -+ -+endif # !CONFIG_RTE_BUILD_SHARED_LIBS -+ -+_LDLIBS-y += --no-whole-archive -+ -+ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n) -+# The static libraries do not know their dependencies. -+# So linking with static library requires explicit dependencies. 
-+_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrt -+ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lnuma -+endif -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lm -+_LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrt -+_LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER) += -lm -+_LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lm -+ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) -+_LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST) += -lnuma -+endif -+_LDLIBS-$(CONFIG_RTE_PORT_PCAP) += -lpcap -+endif # !CONFIG_RTE_BUILD_SHARED_LIBS -+ -+_LDLIBS-y += $(EXECENV_LDLIBS) -+ -+LDLIBS += $(_LDLIBS-y) $(CPU_LDLIBS) $(EXTRA_LDLIBS) -+ -+# all the words except the first one -+allbutfirst = $(wordlist 2,$(words $(1)),$(1)) -+ -+# Eliminate duplicates without sorting, only keep the last occurrence -+filter-libs = \ -+ $(if $(1),$(strip\ -+ $(if \ -+ $(and \ -+ $(filter $(firstword $(1)),$(call allbutfirst,$(1))),\ -+ $(filter -l%,$(firstword $(1)))),\ -+ ,\ -+ $(firstword $(1))) \ -+ $(call filter-libs,$(call allbutfirst,$(1))))) -+ -+LDLIBS := $(call filter-libs,$(LDLIBS)) -+ -+ifeq ($(RTE_DEVEL_BUILD)$(CONFIG_RTE_BUILD_SHARED_LIB),yy) -+LDFLAGS += -rpath=$(RTE_SDK_BIN)/lib -+endif -+ -+MAPFLAGS = -Map=$@.map --cref -+ -+.PHONY: all -+all: install -+ -+.PHONY: install -+install: build _postinstall -+ -+_postinstall: build -+ -+.PHONY: build -+build: _postbuild -+ -+exe2cmd = $(strip $(call dotfile,$(patsubst %,%.cmd,$(1)))) -+ -+ifeq ($(LINK_USING_CC),1) -+O_TO_EXE = $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $(OBJS-y) $(call linkerprefix, \ -+ $(LDLIBS) $(LDFLAGS) $(LDFLAGS_$(@)) $(EXTRA_LDFLAGS) \ -+ $(MAPFLAGS)) -+else -+O_TO_EXE = $(LD) -o $@ $(OBJS-y) \ -+ $(LDLIBS) $(LDFLAGS) $(LDFLAGS_$(@)) $(EXTRA_LDFLAGS) \ -+ $(MAPFLAGS) -+endif -+O_TO_EXE_STR = $(subst ','\'',$(O_TO_EXE)) #'# fix syntax highlight -+O_TO_EXE_DISP = $(if $(V),"$(O_TO_EXE_STR)"," LD $(@)") -+O_TO_EXE_CMD = "cmd_$@ = $(O_TO_EXE_STR)" -+O_TO_EXE_DO = @set -e; \ -+ echo $(O_TO_EXE_DISP); \ -+ $(O_TO_EXE) && \ -+ echo $(O_TO_EXE_CMD) > $(call exe2cmd,$(@)) -+ -+-include .$(APP).cmd -+ -+# path where libraries are retrieved -+LDLIBS_PATH := $(subst -Wl$(comma)-L,,$(filter -Wl$(comma)-L%,$(LDLIBS))) -+LDLIBS_PATH += $(subst -L,,$(filter -L%,$(LDLIBS))) -+ -+# list of .a files that are linked to this application -+LDLIBS_NAMES := $(patsubst -l%,lib%.a,$(filter -l%,$(LDLIBS))) -+LDLIBS_NAMES += $(patsubst -Wl$(comma)-l%,lib%.a,$(filter -Wl$(comma)-l%,$(LDLIBS))) -+ -+# list of found libraries files (useful for deps). 
If not found, the -+# library is silently ignored and dep won't be checked -+LDLIBS_FILES := $(sort $(wildcard $(foreach dir,$(LDLIBS_PATH),\ -+ $(addprefix $(dir)/,$(LDLIBS_NAMES))))) -+ -+# -+# Compile executable file if needed -+# -+$(APP): $(OBJS-y) $(LDLIBS_FILES) $(DEP_$(APP)) $(LDSCRIPT) FORCE -+ @[ -d $(dir $@) ] || mkdir -p $(dir $@) -+ $(if $(D),\ -+ @echo -n "$< -> $@ " ; \ -+ echo -n "file_missing=$(call boolean,$(file_missing)) " ; \ -+ echo -n "cmdline_changed=$(call boolean,$(call cmdline_changed,$(O_TO_EXE_STR))) " ; \ -+ echo -n "depfile_missing=$(call boolean,$(depfile_missing)) " ; \ -+ echo "depfile_newer=$(call boolean,$(depfile_newer)) ") -+ $(if $(or \ -+ $(file_missing),\ -+ $(call cmdline_changed,$(O_TO_EXE_STR)),\ -+ $(depfile_missing),\ -+ $(depfile_newer)),\ -+ $(O_TO_EXE_DO)) -+ -+# -+# install app in $(RTE_OUTPUT)/app -+# -+$(RTE_OUTPUT)/app/$(APP): $(APP) -+ @echo " INSTALL-APP $(APP)" -+ @[ -d $(RTE_OUTPUT)/app ] || mkdir -p $(RTE_OUTPUT)/app -+ $(Q)cp -f $(APP) $(RTE_OUTPUT)/app -+ -+# -+# install app map file in $(RTE_OUTPUT)/app -+# -+$(RTE_OUTPUT)/app/$(APP).map: $(APP) -+ @echo " INSTALL-MAP $(APP).map" -+ @[ -d $(RTE_OUTPUT)/app ] || mkdir -p $(RTE_OUTPUT)/app -+ $(Q)cp -f $(APP).map $(RTE_OUTPUT)/app -+ -+# -+# Clean all generated files -+# -+.PHONY: clean -+clean: _postclean -+ $(Q)rm -f $(_BUILD_TARGETS) $(_INSTALL_TARGETS) $(_CLEAN_TARGETS) -+ -+.PHONY: doclean -+doclean: -+ $(Q)rm -rf $(APP) $(OBJS-all) $(DEPS-all) $(DEPSTMP-all) \ -+ $(CMDS-all) $(INSTALL-FILES-all) .$(APP).cmd $(APP).map -+ -+ -+include $(RTE_SDK)/mk/internal/rte.compile-post.mk -+include $(RTE_SDK)/mk/internal/rte.install-post.mk -+include $(RTE_SDK)/mk/internal/rte.clean-post.mk -+include $(RTE_SDK)/mk/internal/rte.build-post.mk -+ -+ifneq ($(wildcard $(RTE_SDK)/mk/target/$(RTE_TARGET)/rte.app.mk),) -+include $(RTE_SDK)/mk/target/$(RTE_TARGET)/rte.app.mk -+else -+include $(RTE_SDK)/mk/target/generic/rte.app.mk -+endif -+ -+.PHONY: FORCE -+FORCE: -+ diff --git a/patch/dpdk-stable-17.11.6/enable-dpdk-eal-memory-debug.patch b/patch/dpdk-stable-17.11.6/enable-dpdk-eal-memory-debug.patch deleted file mode 100644 index 96c4940b8..000000000 --- a/patch/dpdk-stable-17.11.6/enable-dpdk-eal-memory-debug.patch +++ /dev/null @@ -1,68 +0,0 @@ -From e21a4e12c4e8dd60a68041a7b52f07e9e68053ff Mon Sep 17 00:00:00 2001 -From: wencyu -Date: Mon, 6 Jan 2020 20:20:17 +0800 -Subject: [PATCH] debug: enable dpdk eal memory debug - ---- - config/common_base | 2 +- - lib/librte_eal/common/include/rte_malloc.h | 15 +++++++++++++++ - lib/librte_eal/common/rte_malloc.c | 4 ++++ - 3 files changed, 20 insertions(+), 1 deletion(-) - -diff --git a/config/common_base b/config/common_base -index 31f50b4..4cb1957 100644 ---- a/config/common_base -+++ b/config/common_base -@@ -103,7 +103,7 @@ CONFIG_RTE_EAL_ALLOW_INV_SOCKET_ID=n - CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n - CONFIG_RTE_EAL_IGB_UIO=n - CONFIG_RTE_EAL_VFIO=n --CONFIG_RTE_MALLOC_DEBUG=n -+CONFIG_RTE_MALLOC_DEBUG=y - CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n - CONFIG_RTE_USE_LIBBSD=n - -diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h -index 5d4c11a..0d3833e 100644 ---- a/lib/librte_eal/common/include/rte_malloc.h -+++ b/lib/librte_eal/common/include/rte_malloc.h -@@ -242,6 +242,21 @@ struct rte_malloc_socket_stats { - rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int socket); - - /** -+ * Check the header/tailer cookies of memory pointed to by the provided pointer. 
-+ * -+ * This pointer must have been returned by a previous call to -+ * rte_malloc(), rte_zmalloc(), rte_calloc() or rte_realloc(). -+ * -+ * @param ptr -+ * The pointer to memory to be checked. -+ * @return -+ * - true if the header/tailer cookies are OK. -+ * - Otherwise, false. -+ */ -+int -+rte_memmory_ok(void *ptr); -+ -+/** - * Frees the memory space pointed to by the provided pointer. - * - * This pointer must have been returned by a previous call to -diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c -index fe2278b..81d22f0 100644 ---- a/lib/librte_eal/common/rte_malloc.c -+++ b/lib/librte_eal/common/rte_malloc.c -@@ -53,6 +53,10 @@ - #include "malloc_elem.h" - #include "malloc_heap.h" - -+int rte_memmory_ok(void *addr) -+{ -+ return malloc_elem_cookies_ok(RTE_PTR_SUB(addr, MALLOC_ELEM_HEADER_LEN)); -+} - - /* Free the memory space back to heap */ - void rte_free(void *addr) --- -1.8.3.1 - diff --git a/patch/dpdk-stable-18.11.2/0001-kni-use-netlink-event-for-multicast-driver-part.patch b/patch/dpdk-stable-18.11.2/0001-kni-use-netlink-event-for-multicast-driver-part.patch deleted file mode 100644 index 17aea758a..000000000 --- a/patch/dpdk-stable-18.11.2/0001-kni-use-netlink-event-for-multicast-driver-part.patch +++ /dev/null @@ -1,124 +0,0 @@ -From 659c6e84e3ae0c5e5b93894aa15dd4983b3ac6c3 Mon Sep 17 00:00:00 2001 -From: ywc689 -Date: Fri, 28 Jun 2019 16:52:24 +0800 -Subject: [PATCH 1/3] kni: use netlink event for multicast (driver part) - -kni driver send netlink event every time hw-multicast list updated by -kernel, the user kni app should capture the event and update multicast -to kni device. - -original way is using rte_kni_request to pass hw-multicast to user kni -module. that method works but finally memory corruption found, which is -to kni device. ---- - kernel/linux/kni/kni_net.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 70 insertions(+) - -diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c -index 7371b6d..edc1416 100644 ---- a/kernel/linux/kni/kni_net.c -+++ b/kernel/linux/kni/kni_net.c -@@ -16,6 +16,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -103,6 +105,7 @@ - ret_val = wait_event_interruptible_timeout(kni->wq, - kni_fifo_count(kni->resp_q), 3 * HZ); - if (signal_pending(current) || ret_val <= 0) { -+ pr_err("%s: wait_event_interruptible timeout\n", __func__); - ret = -ETIME; - goto fail; - } -@@ -605,9 +608,75 @@ void kni_net_release_fifo_phy(struct kni_dev *kni) - return -EOPNOTSUPP; - } - -+static size_t -+kni_nlmsg_size(void) -+{ -+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) -+ + nla_total_size(4) /* IFA_ADDRESS */ -+ + nla_total_size(4) /* IFA_LOCAL */ -+ + nla_total_size(4) /* IFA_BROADCAST */ -+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ -+ + nla_total_size(4) /* IFA_FLAGS */ -+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ -+} -+ - static void - kni_net_set_rx_mode(struct net_device *dev) - { -+ /* -+ * send event to notify user (DPDK KNI app) that multicast list changed, -+ * so that it can monitor multicast join/leave and set HW mc-addrs to -+ * kni dev accordinglly. -+ * -+ * this event is just an notification, we do not save any mc-addr here -+ * (so attribute space for us). user kni app should get maddrs after -+ * receive this notification. -+ * -+ * I was expecting kernel send some rtnl event for multicast join/leave, -+ * but it doesn't. 
By checking the call-chain of SIOCADDMULTI (ip maddr, -+ * manages only hardware multicast) and IP_ADD_MEMBERSHIP (ip_mc_join_group, -+ * used to for IPv4 multicast), no rtnl event sent. -+ * -+ * so as workaround, modify kni driver here to send RTM_NEWADDR. -+ * it may not suitalbe to use this event for mcast, but that should works. -+ * hope that won't affect other listener to this event. -+ * -+ * previous solution was using rte_kni_request to pass hw-maddr list to user. -+ * it "works" for times but finally memory corruption found, which is -+ * not easy to address (lock was added and reviewed). That's why we use -+ * netlink event instead. -+ */ -+ struct sk_buff *skb; -+ struct net *net = dev_net(dev); -+ struct nlmsghdr *nlh; -+ struct ifaddrmsg *ifm; -+ -+ skb = nlmsg_new(kni_nlmsg_size(), GFP_ATOMIC); -+ if (!skb) -+ return; -+ -+ /* no other event for us ? */ -+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWADDR, sizeof(*ifm), 0); -+ if (!nlh) { -+ kfree_skb(skb); -+ return; -+ } -+ -+ /* just send an notification so no other info */ -+ ifm = nlmsg_data(nlh); -+ memset(ifm, 0, sizeof(*ifm)); -+ ifm->ifa_family = AF_UNSPEC; -+ ifm->ifa_prefixlen = 0; -+ ifm->ifa_flags = 0; -+ ifm->ifa_scope = RT_SCOPE_NOWHERE; -+ ifm->ifa_index = 0; -+ -+ nlmsg_end(skb, nlh); -+ -+ /* other group ? */ -+ pr_debug("%s: rx-mode/multicast-list changed\n", __func__); -+ rtnl_notify(skb, net, 0, RTNLGRP_NOTIFY, NULL, GFP_ATOMIC); -+ return; - } - - static int -@@ -727,6 +796,7 @@ void kni_net_release_fifo_phy(struct kni_dev *kni) - kni = netdev_priv(netdev); - ret = kni_net_process_request(kni, &req); - -+ pr_info("%s request returns %d!\n", __func__, ret); - return (ret == 0 ? req.result : ret); - } - --- -1.8.3.1 - diff --git a/patch/dpdk-stable-18.11.2/0002-net-support-variable-IP-header-len-for-checksum-API.patch b/patch/dpdk-stable-18.11.2/0002-net-support-variable-IP-header-len-for-checksum-API.patch deleted file mode 100644 index 2356e2154..000000000 --- a/patch/dpdk-stable-18.11.2/0002-net-support-variable-IP-header-len-for-checksum-API.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 86d8695113517403c59497dc2f43a333fa44316b Mon Sep 17 00:00:00 2001 -From: ywc689 -Date: Fri, 28 Jun 2019 16:27:08 +0800 -Subject: [PATCH 2/3] net: support variable IP header len for checksum API. - -IPv4 checksum APIs use fixe IP header length, it will failed if there is -any IP option. Now calculating header length by "ihl" field, so that we -can support options. ---- - lib/librte_net/rte_ip.h | 13 +++++++------ - 1 file changed, 7 insertions(+), 6 deletions(-) - -diff --git a/lib/librte_net/rte_ip.h b/lib/librte_net/rte_ip.h -index f9b9090..635bdcc 100644 ---- a/lib/librte_net/rte_ip.h -+++ b/lib/librte_net/rte_ip.h -@@ -252,7 +252,7 @@ struct ipv4_hdr { - rte_ipv4_cksum(const struct ipv4_hdr *ipv4_hdr) - { - uint16_t cksum; -- cksum = rte_raw_cksum(ipv4_hdr, sizeof(struct ipv4_hdr)); -+ cksum = rte_raw_cksum(ipv4_hdr, (ipv4_hdr->version_ihl & 0xf) * 4); - return (cksum == 0xffff) ? 
cksum : (uint16_t)~cksum; - } - -@@ -294,7 +294,7 @@ struct ipv4_hdr { - } else { - psd_hdr.len = rte_cpu_to_be_16( - (uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length) -- - sizeof(struct ipv4_hdr))); -+ - (ipv4_hdr->version_ihl & 0xf) * 4)); - } - return rte_raw_cksum(&psd_hdr, sizeof(psd_hdr)); - } -@@ -317,13 +317,14 @@ struct ipv4_hdr { - rte_ipv4_udptcp_cksum(const struct ipv4_hdr *ipv4_hdr, const void *l4_hdr) - { - uint32_t cksum; -- uint32_t l3_len, l4_len; -+ uint32_t l3_len, l4_len, iphlen; - - l3_len = rte_be_to_cpu_16(ipv4_hdr->total_length); -- if (l3_len < sizeof(struct ipv4_hdr)) -- return 0; -+ iphlen = (ipv4_hdr->version_ihl & 0xf) * 4; - -- l4_len = l3_len - sizeof(struct ipv4_hdr); -+ if (l3_len < iphlen) -+ return 0; -+ l4_len = l3_len - iphlen; - - cksum = rte_raw_cksum(l4_hdr, l4_len); - cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0); --- -1.8.3.1 - diff --git a/patch/dpdk-stable-18.11.2/0003-driver-kni-enable-flow_item-type-comparsion-in-flow_.patch b/patch/dpdk-stable-18.11.2/0003-driver-kni-enable-flow_item-type-comparsion-in-flow_.patch deleted file mode 100644 index d5634fb53..000000000 --- a/patch/dpdk-stable-18.11.2/0003-driver-kni-enable-flow_item-type-comparsion-in-flow_.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 2e26428dc4d450e974ceb9bc737f691057cd80b6 Mon Sep 17 00:00:00 2001 -From: ywc689 -Date: Fri, 28 Jun 2019 17:02:40 +0800 -Subject: [PATCH 3/3] driver:kni: enable flow_item type comparsion in - flow_fdir_cmp - -the existence is checked before adding/deleting a fdir flow, but -the flow type is not compared in 'flow_fdir_cmp', which resulting -in the failure or unwanted behavior in adding/deleting two same -fdir flows with flow type(such as ipv4 tcp/udp) different only. ---- - drivers/net/mlx5/mlx5_flow.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c -index 222cd81..d99edce 100644 ---- a/drivers/net/mlx5/mlx5_flow.c -+++ b/drivers/net/mlx5/mlx5_flow.c -@@ -2668,6 +2668,11 @@ struct rte_flow * - static int - flow_fdir_cmp(const struct mlx5_fdir *f1, const struct mlx5_fdir *f2) - { -+ unsigned i; -+ for (i = 0; i < sizeof(f1->items)/sizeof(f1->items[0]); i++) { -+ if (f1->items[i].type != f2->items[i].type) -+ return 1; -+ } - if (FLOW_FDIR_CMP(f1, f2, attr) || - FLOW_FDIR_CMP(f1, f2, l2) || - FLOW_FDIR_CMP(f1, f2, l2_mask) || --- -1.8.3.1 - diff --git a/patch/dpdk-stable-18.11.2/0004-rm-rte_experimental-attribute-of-rte_memseg_walk.patch b/patch/dpdk-stable-18.11.2/0004-rm-rte_experimental-attribute-of-rte_memseg_walk.patch deleted file mode 100644 index 65b69b387..000000000 --- a/patch/dpdk-stable-18.11.2/0004-rm-rte_experimental-attribute-of-rte_memseg_walk.patch +++ /dev/null @@ -1,42 +0,0 @@ -From f90a349979926bb547583dc6f1b6b1f1ab3b3189 Mon Sep 17 00:00:00 2001 -From: liuchuanqi -Date: Fri, 7 Aug 2020 19:20:57 +0800 -Subject: [PATCH] rm rte_experimental attribute of rte_memseg_walk - -there is no __rte_experimental attribute in function rte_mempool_walk and rte_memzone_walk of dpdk 18.11, -and there is no __rte_experimental attribute in function rte_memseg_walk of the higher version's dpdk(eg: dpdk 20.05). -so remove it to prevent compilation error when dpdk application calls the function. 
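[Editor's note: a minimal sketch — not taken from DPVS — of the kind of application code the removed patch above was unblocking. On dpdk-18.11, `rte_memseg_walk()` carried the `__rte_experimental` tag, so a caller built with `-Werror` needed `-DALLOW_EXPERIMENTAL_API` unless the tag was stripped as this patch did; the callback signature follows `rte_memory.h`.]

```c
#include <rte_memory.h>

/* Callback invoked once per memseg; returning 0 continues the walk,
 * a non-zero value stops it (see rte_memseg_walk_t in rte_memory.h). */
static int
count_memseg(const struct rte_memseg_list *msl,
             const struct rte_memseg *ms, void *arg)
{
    size_t *total = arg;

    (void)msl;
    *total += ms->len;
    return 0;
}

static size_t
hugepage_bytes(void)
{
    size_t total = 0;

    /* On dpdk-18.11 this call is what triggered the experimental-API
     * build error that the patch above removed. */
    rte_memseg_walk(count_memseg, &total);
    return total;
}
```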
---- - lib/librte_eal/common/eal_common_memory.c | 2 +- - lib/librte_eal/common/include/rte_memory.h | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c -index d47ea4938..3d8ce67f0 100644 ---- a/lib/librte_eal/common/eal_common_memory.c -+++ b/lib/librte_eal/common/eal_common_memory.c -@@ -601,7 +601,7 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg) - return ret; - } - --int __rte_experimental -+int - rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg) - { - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; -diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h -index d970825df..71bee8b6b 100644 ---- a/lib/librte_eal/common/include/rte_memory.h -+++ b/lib/librte_eal/common/include/rte_memory.h -@@ -227,7 +227,7 @@ typedef int (*rte_memseg_list_walk_t)(const struct rte_memseg_list *msl, - * 1 if stopped by the user - * -1 if user function reported error - */ --int __rte_experimental -+int - rte_memseg_walk(rte_memseg_walk_t func, void *arg); - - /** --- -2.21.1 (Apple Git-122.3) - diff --git a/patch/dpdk-stable-18.11.2/0006-enable-dpdk-eal-memory-debug.patch b/patch/dpdk-stable-18.11.2/0006-enable-dpdk-eal-memory-debug.patch deleted file mode 100644 index c286c15f3..000000000 --- a/patch/dpdk-stable-18.11.2/0006-enable-dpdk-eal-memory-debug.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 19652889ed74b09aba6f22dfa96b19c009a7309a Mon Sep 17 00:00:00 2001 -From: ywc -Date: Mon, 25 Jan 2021 10:27:52 +0800 -Subject: [PATCH] enable dpdk eal memory debug - ---- - config/common_base | 2 +- - lib/librte_eal/common/include/rte_malloc.h | 15 +++++++++++++++ - lib/librte_eal/common/rte_malloc.c | 4 ++++ - 3 files changed, 20 insertions(+), 1 deletion(-) - -diff --git a/config/common_base b/config/common_base -index d12ae98..765ae2e 100644 ---- a/config/common_base -+++ b/config/common_base -@@ -94,7 +94,7 @@ CONFIG_RTE_EAL_IGB_UIO=n - CONFIG_RTE_EAL_VFIO=n - CONFIG_RTE_MAX_VFIO_GROUPS=64 - CONFIG_RTE_MAX_VFIO_CONTAINERS=64 --CONFIG_RTE_MALLOC_DEBUG=n -+CONFIG_RTE_MALLOC_DEBUG=y - CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n - CONFIG_RTE_USE_LIBBSD=n - -diff --git a/lib/librte_eal/common/include/rte_malloc.h b/lib/librte_eal/common/include/rte_malloc.h -index e0be13c..f3bcdc6 100644 ---- a/lib/librte_eal/common/include/rte_malloc.h -+++ b/lib/librte_eal/common/include/rte_malloc.h -@@ -214,6 +214,21 @@ struct rte_malloc_socket_stats { - rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int socket); - - /** -+ * Check the header/tailer cookies of memory pointed to by the provided pointer. -+ * -+ * This pointer must have been returned by a previous call to -+ * rte_malloc(), rte_zmalloc(), rte_calloc() or rte_realloc(). -+ * -+ * @param ptr -+ * The pointer to memory to be checked. -+ * @return -+ * - true if the header/tailer cookies are OK. -+ * - Otherwise, false. -+ */ -+int -+rte_memmory_ok(void *ptr); -+ -+/** - * Frees the memory space pointed to by the provided pointer. 
- * - * This pointer must have been returned by a previous call to -diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c -index 47c2bec..1fab27c 100644 ---- a/lib/librte_eal/common/rte_malloc.c -+++ b/lib/librte_eal/common/rte_malloc.c -@@ -26,6 +26,10 @@ - #include "malloc_heap.h" - #include "eal_memalloc.h" - -+int rte_memmory_ok(void *addr) -+{ -+ return malloc_elem_cookies_ok(RTE_PTR_SUB(addr, MALLOC_ELEM_HEADER_LEN)); -+} - - /* Free the memory space back to heap */ - void rte_free(void *addr) --- -1.8.3.1 - diff --git a/patch/dpdk-stable-17.11.6/0001-kni-use-netlink-event-for-multicast-driver-part.patch b/patch/dpdk-stable-20.11.1/0001-kni-use-netlink-event-for-multicast-driver-part.patch similarity index 50% rename from patch/dpdk-stable-17.11.6/0001-kni-use-netlink-event-for-multicast-driver-part.patch rename to patch/dpdk-stable-20.11.1/0001-kni-use-netlink-event-for-multicast-driver-part.patch index 0934c174a..e39254c24 100644 --- a/patch/dpdk-stable-17.11.6/0001-kni-use-netlink-event-for-multicast-driver-part.patch +++ b/patch/dpdk-stable-20.11.1/0001-kni-use-netlink-event-for-multicast-driver-part.patch @@ -1,51 +1,59 @@ -From b5dc636f0ccdccb3d4e94f3453b6e95a631bb10a Mon Sep 17 00:00:00 2001 -From: ywc689 -Date: Fri, 28 Jun 2019 17:52:13 +0800 -Subject: [PATCH 1/3] kni: use netlink event for multicast (driver part) +From 5b032cc0d59f9fe2e9607423a92399254e30a8f7 Mon Sep 17 00:00:00 2001 +From: huangyichen +Date: Thu, 1 Jul 2021 21:21:16 +0800 +Subject: [PATCH 1/6] kni: use netlink event for multicast (driver part) -kni driver send netlink event every time hw-multicast list updated by +Kni driver sends netlink event every time hw-multicast list updated by kernel, the user kni app should capture the event and update multicast to kni device. -original way is using rte_kni_request to pass hw-multicast to user kni -module. that method works but finally memory corruption found, which is -to kni device. +Original way is using rte_kni_request to pass hw-multicast to user kni +module. That method works but finally memory corruption found, which is +not easy to address. That's why we use netlink event instead. --- - lib/librte_eal/linuxapp/kni/kni_net.c | 68 +++++++++++++++++++++++++++++++++++ - 1 file changed, 68 insertions(+) + kernel/linux/kni/kni_net.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 76 insertions(+) -diff --git a/lib/librte_eal/linuxapp/kni/kni_net.c b/lib/librte_eal/linuxapp/kni/kni_net.c -index db9f489..fab94d1 100644 ---- a/lib/librte_eal/linuxapp/kni/kni_net.c -+++ b/lib/librte_eal/linuxapp/kni/kni_net.c -@@ -35,6 +35,8 @@ +diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c +index 4b75208..cde565e 100644 +--- a/kernel/linux/kni/kni_net.c ++++ b/kernel/linux/kni/kni_net.c +@@ -17,6 +17,8 @@ #include #include #include +#include +#include - #include + #include #include -@@ -579,9 +581,75 @@ - return 0; +@@ -128,6 +130,7 @@ kni_net_process_request(struct kni_dev *kni, struct rte_kni_request *req) + ret_val = wait_event_interruptible_timeout(kni->wq, + kni_fifo_count(kni->resp_q), 3 * HZ); + if (signal_pending(current) || ret_val <= 0) { ++ pr_err("%s: wait_event_interruptible timeout\n", __func__); + ret = -ETIME; + goto fail; + } +@@ -657,6 +660,77 @@ kni_net_change_mtu(struct net_device *dev, int new_mtu) + return (ret == 0) ? 
req.result : ret; } +static size_t +kni_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) -+ + nla_total_size(4) /* IFA_ADDRESS */ -+ + nla_total_size(4) /* IFA_LOCAL */ -+ + nla_total_size(4) /* IFA_BROADCAST */ -+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ -+ + nla_total_size(4) /* IFA_FLAGS */ -+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ ++ + nla_total_size(4) /* IFA_ADDRESS */ ++ + nla_total_size(4) /* IFA_LOCAL */ ++ + nla_total_size(4) /* IFA_BROADCAST */ ++ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ ++ + nla_total_size(4) /* IFA_FLAGS */ ++ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ +} + - static void - kni_net_set_rx_mode(struct net_device *dev) - { ++static void ++kni_net_set_rx_mode(struct net_device *dev) ++{ + /* + * send event to notify user (DPDK KNI app) that multicast list changed, + * so that it can monitor multicast join/leave and set HW mc-addrs to @@ -100,9 +108,27 @@ index db9f489..fab94d1 100644 + pr_debug("%s: rx-mode/multicast-list changed\n", __func__); + rtnl_notify(skb, net, 0, RTNLGRP_NOTIFY, NULL, GFP_ATOMIC); + return; ++} ++ + static void + kni_net_change_rx_flags(struct net_device *netdev, int flags) + { +@@ -757,6 +831,7 @@ kni_net_set_mac(struct net_device *netdev, void *p) + kni = netdev_priv(netdev); + ret = kni_net_process_request(kni, &req); + ++ pr_info("%s request returns %d!\n", __func__, ret); + return (ret == 0 ? req.result : ret); } - static int +@@ -788,6 +863,7 @@ static const struct net_device_ops kni_net_netdev_ops = { + .ndo_change_rx_flags = kni_net_change_rx_flags, + .ndo_start_xmit = kni_net_tx, + .ndo_change_mtu = kni_net_change_mtu, ++ .ndo_set_rx_mode = kni_net_set_rx_mode, + .ndo_tx_timeout = kni_net_tx_timeout, + .ndo_set_mac_address = kni_net_set_mac, + #ifdef HAVE_CHANGE_CARRIER_CB -- 1.8.3.1 diff --git a/patch/dpdk-stable-18.11.2/0005-enable-pdump-and-change-dpdk-pdump-tool-for-dpvs.patch b/patch/dpdk-stable-20.11.1/0002-pdump-change-dpdk-pdump-tool-for-dpvs.patch similarity index 51% rename from patch/dpdk-stable-18.11.2/0005-enable-pdump-and-change-dpdk-pdump-tool-for-dpvs.patch rename to patch/dpdk-stable-20.11.1/0002-pdump-change-dpdk-pdump-tool-for-dpvs.patch index d2a68efbc..89d3f4c47 100644 --- a/patch/dpdk-stable-18.11.2/0005-enable-pdump-and-change-dpdk-pdump-tool-for-dpvs.patch +++ b/patch/dpdk-stable-20.11.1/0002-pdump-change-dpdk-pdump-tool-for-dpvs.patch @@ -1,17 +1,27 @@ +From 8d1dc22740a315d62596445beba8b8737c45ffa4 Mon Sep 17 00:00:00 2001 +From: huangyichen +Date: Thu, 1 Jul 2021 21:23:50 +0800 +Subject: [PATCH 2/6] pdump: change dpdk-pdump tool for dpvs + +--- + app/pdump/main.c | 167 ++++++++++++++++++++++++++++++++++++++++--- + lib/librte_pdump/rte_pdump.c | 145 +++++++++++++++++++++++++++++++++++-- + lib/librte_pdump/rte_pdump.h | 27 +++++++ + 3 files changed, 327 insertions(+), 12 deletions(-) + diff --git a/app/pdump/main.c b/app/pdump/main.c -index ccf2a1d..7e72c16 100644 +index b34bf33..9d14474 100644 --- a/app/pdump/main.c +++ b/app/pdump/main.c -@@ -26,6 +26,8 @@ +@@ -27,6 +27,7 @@ #include #include #include +#include -+ #define CMD_LINE_OPT_PDUMP "pdump" - #define PDUMP_PORT_ARG "port" -@@ -38,6 +40,14 @@ + #define CMD_LINE_OPT_PDUMP_NUM 256 +@@ -42,6 +43,14 @@ #define PDUMP_MSIZE_ARG "mbuf-size" #define PDUMP_NUM_MBUFS_ARG "total-num-mbufs" @@ -26,7 +36,7 @@ index ccf2a1d..7e72c16 100644 #define VDEV_NAME_FMT "net_pcap_%s_%d" #define VDEV_PCAP_ARGS_FMT "tx_pcap=%s" #define VDEV_IFACE_ARGS_FMT "tx_iface=%s" -@@ -91,6 +101,13 @@ enum 
pdump_by { +@@ -97,6 +106,13 @@ static const char * const valid_pdump_arguments[] = { PDUMP_RING_SIZE_ARG, PDUMP_MSIZE_ARG, PDUMP_NUM_MBUFS_ARG, @@ -40,15 +50,15 @@ index ccf2a1d..7e72c16 100644 NULL }; -@@ -124,6 +141,7 @@ struct pdump_tuples { +@@ -130,6 +146,7 @@ struct pdump_tuples { enum pcap_stream rx_vdev_stream_type; enum pcap_stream tx_vdev_stream_type; bool single_pdump_dev; -+ struct pdump_filter *filter; ++ struct pdump_filter *filter; /* stats */ struct pdump_stats stats; -@@ -149,6 +167,11 @@ struct parse_val { +@@ -158,6 +175,11 @@ pdump_usage(const char *prgname) "(queue=)," "(rx-dev= |" " tx-dev=," @@ -60,149 +70,148 @@ index ccf2a1d..7e72c16 100644 "[ring-size=default:16384]," "[mbuf-size=default:2176]," "[total-num-mbufs=default:65535]'\n", -@@ -235,6 +258,65 @@ struct parse_val { +@@ -244,6 +266,64 @@ parse_uint_value(const char *key, const char *value, void *extra_args) } static int +parse_host(const char *key __rte_unused, const char *value, void *extra_args) +{ -+ struct pdump_tuples *pt = extra_args; -+ struct in_addr inaddr; -+ struct in6_addr inaddr6; -+ union addr addr; -+ int af = 0; -+ -+ if (inet_pton(AF_INET6, value, &inaddr6) > 0) { -+ af = AF_INET6; -+ addr.in6 = inaddr6; -+ } else if (inet_pton(AF_INET, value, &inaddr) > 0){ -+ af = AF_INET; -+ addr.in = inaddr; -+ } else { -+ printf("IP address invaled\n"); -+ return -EINVAL; -+ } -+ -+ if (pt->filter && pt->filter->af != 0 && af != pt->filter->af) { -+ printf("IPv4 and IPv6 conflict\n"); -+ return -EINVAL; -+ } else { -+ pt->filter->af = af; -+ } -+ -+ if (!strcmp(key, PDUMP_HOST_ARG)) { -+ rte_memcpy(&pt->filter->host_addr, &addr, sizeof(addr)); -+ } else if (!strcmp(key, PDUMP_SRC_ARG)) { -+ rte_memcpy(&pt->filter->s_addr, &addr, sizeof(addr)); -+ } else if (!strcmp(key, PDUMP_DST_ARG)) { -+ rte_memcpy(&pt->filter->d_addr, &addr, sizeof(addr)); -+ } -+ -+ return 0; ++ struct pdump_tuples *pt = extra_args; ++ struct in_addr inaddr; ++ struct in6_addr inaddr6; ++ union addr addr; ++ int af = 0; ++ ++ if (inet_pton(AF_INET6, value, &inaddr6) > 0) { ++ af = AF_INET6; ++ addr.in6 = inaddr6; ++ } else if (inet_pton(AF_INET, value, &inaddr) > 0){ ++ af = AF_INET; ++ addr.in = inaddr; ++ } else { ++ printf("IP address invaled\n"); ++ return -EINVAL; ++ } ++ ++ if (pt->filter && pt->filter->af != 0 && af != pt->filter->af) { ++ printf("IPv4 and IPv6 conflict\n"); ++ return -EINVAL; ++ } else { ++ pt->filter->af = af; ++ } ++ ++ if (!strcmp(key, PDUMP_HOST_ARG)) { ++ rte_memcpy(&pt->filter->host_addr, &addr, sizeof(addr)); ++ } else if (!strcmp(key, PDUMP_SRC_ARG)) { ++ rte_memcpy(&pt->filter->s_addr, &addr, sizeof(addr)); ++ } else if (!strcmp(key, PDUMP_DST_ARG)) { ++ rte_memcpy(&pt->filter->d_addr, &addr, sizeof(addr)); ++ } ++ ++ return 0; +} + +static int +parse_proto(const char *key __rte_unused, const char *value, void *extra_args) +{ -+ struct pdump_tuples *pt = extra_args; -+ -+ if (!strcmp(value, "tcp")) { -+ pt->filter->proto = IPPROTO_TCP; -+ } else if (!strcmp(value, "udp")) { -+ pt->filter->proto = IPPROTO_UDP; -+ } else if (!strcmp(value, "icmp")) { -+ pt->filter->proto = IPPROTO_ICMP; -+ } else { -+ printf("invalid value:\"%s\" for key:\"%s\", " -+ "value must be tcp/udp/icmp\n", value, key); -+ return -EINVAL; -+ } -+ -+ return 0; -+} ++ struct pdump_tuples *pt = extra_args; ++ ++ if (!strcmp(value, "tcp")) { ++ pt->filter->proto = IPPROTO_TCP; ++ } else if (!strcmp(value, "udp")) { ++ pt->filter->proto = IPPROTO_UDP; ++ } else if (!strcmp(value, "icmp")) { ++ pt->filter->proto = IPPROTO_ICMP; 
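/* [Editor's note -- a hedged usage sketch; the address and port below are
 * made up. The kvargs handled above fill in struct pdump_filter from
 * rte_pdump.h, so an invocation along the lines of
 *
 *   dpdk-pdump -- --pdump 'port=0,queue=*,rx-dev=/tmp/rx.pcap,
 *                          host=192.168.0.1,proto=tcp,proto-port=80'
 *
 * roughly yields:
 *   filter->af         = AF_INET;      -- set by parse_host()
 *   filter->host_addr  = 192.168.0.1;  -- matches either src or dst address
 *   filter->proto      = IPPROTO_TCP;  -- set by parse_proto() above
 *   filter->proto_port = 80;           -- matches either src or dst port
 * while src-host=/dst-host= and src-port=/dst-port= fill the
 * direction-specific fields instead.] */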
++ } else { ++ printf("invalid value:\"%s\" for key:\"%s\", " ++ "value must be tcp/udp/icmp\n", value, key); ++ return -EINVAL; ++ } + ++ return 0; ++} + +static int parse_pdump(const char *optarg) { struct rte_kvargs *kvlist; -@@ -361,6 +443,75 @@ struct parse_val { +@@ -370,6 +450,75 @@ parse_pdump(const char *optarg) } else pt->total_num_mbufs = MBUFS_PER_POOL; -+ /* filter parsing and validation */ -+ pt->filter = rte_zmalloc("pdump_filter", -+ sizeof(struct pdump_filter), 0); -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_HOST_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_HOST_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_SRC_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_SRC_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_DST_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_DST_ARG, -+ &parse_host, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } -+ -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_PORT_AGE); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_PORT_AGE, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->proto_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_SPORT_ARG); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_SPORT_ARG, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->s_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_DPORT_ARG); -+ if (cnt1 == 1) { -+ v.min = 1; -+ v.max = UINT16_MAX; -+ ret = rte_kvargs_process(kvlist, PDUMP_DPORT_ARG, -+ &parse_uint_value, &v); -+ if (ret < 0) -+ goto free_kvlist; -+ pt->filter->d_port = (uint16_t) v.val; -+ } -+ -+ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_ARG); -+ if (cnt1 == 1) { -+ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_ARG, -+ &parse_proto, pt); -+ if (ret < 0) -+ goto free_kvlist; -+ } ++ /* filter parsing and validation */ ++ pt->filter = rte_zmalloc("pdump_filter", ++ sizeof(struct pdump_filter), 0); ++ cnt1 = rte_kvargs_count(kvlist, PDUMP_HOST_ARG); ++ if (cnt1 == 1) { ++ ret = rte_kvargs_process(kvlist, PDUMP_HOST_ARG, ++ &parse_host, pt); ++ if (ret < 0) ++ goto free_kvlist; ++ } ++ ++ cnt1 = rte_kvargs_count(kvlist, PDUMP_SRC_ARG); ++ if (cnt1 == 1) { ++ ret = rte_kvargs_process(kvlist, PDUMP_SRC_ARG, ++ &parse_host, pt); ++ if (ret < 0) ++ goto free_kvlist; ++ } ++ ++ cnt1 = rte_kvargs_count(kvlist, PDUMP_DST_ARG); ++ if (cnt1 == 1) { ++ ret = rte_kvargs_process(kvlist, PDUMP_DST_ARG, ++ &parse_host, pt); ++ if (ret < 0) ++ goto free_kvlist; ++ } ++ ++ ++ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_PORT_AGE); ++ if (cnt1 == 1) { ++ v.min = 1; ++ v.max = UINT16_MAX; ++ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_PORT_AGE, ++ &parse_uint_value, &v); ++ if (ret < 0) ++ goto free_kvlist; ++ pt->filter->proto_port = (uint16_t) v.val; ++ } ++ ++ cnt1 = rte_kvargs_count(kvlist, PDUMP_SPORT_ARG); ++ if (cnt1 == 1) { ++ v.min = 1; ++ v.max = UINT16_MAX; ++ ret = rte_kvargs_process(kvlist, PDUMP_SPORT_ARG, ++ &parse_uint_value, &v); ++ if (ret < 0) ++ goto free_kvlist; ++ pt->filter->s_port = (uint16_t) v.val; ++ } ++ ++ cnt1 = rte_kvargs_count(kvlist, PDUMP_DPORT_ARG); ++ if (cnt1 == 1) { ++ v.min = 1; ++ v.max = UINT16_MAX; ++ ret = rte_kvargs_process(kvlist, PDUMP_DPORT_ARG, ++ &parse_uint_value, &v); ++ if (ret < 0) 
++ goto free_kvlist; ++ pt->filter->d_port = (uint16_t) v.val; ++ } ++ ++ cnt1 = rte_kvargs_count(kvlist, PDUMP_PROTO_ARG); ++ if (cnt1 == 1) { ++ ret = rte_kvargs_process(kvlist, PDUMP_PROTO_ARG, ++ &parse_proto, pt); ++ if (ret < 0) ++ goto free_kvlist; ++ } + num_tuples++; free_kvlist: -@@ -486,6 +637,8 @@ struct parse_val { +@@ -510,6 +659,8 @@ cleanup_rings(void) rte_ring_free(pt->rx_ring); if (pt->tx_ring) rte_ring_free(pt->tx_ring); @@ -211,15 +220,7 @@ index ccf2a1d..7e72c16 100644 } } -@@ -527,6 +680,7 @@ struct parse_val { - } - - } -+ - cleanup_rings(); - } - -@@ -789,20 +943,20 @@ struct parse_val { +@@ -837,20 +988,20 @@ enable_pdump(void) pt->queue, RTE_PDUMP_FLAG_RX, pt->rx_ring, @@ -244,7 +245,7 @@ index ccf2a1d..7e72c16 100644 } } else if (pt->dir == RTE_PDUMP_FLAG_RX) { if (pt->dump_by_type == DEVICE_ID) -@@ -810,22 +964,22 @@ struct parse_val { +@@ -858,22 +1009,22 @@ enable_pdump(void) pt->device_id, pt->queue, pt->dir, pt->rx_ring, @@ -271,21 +272,8 @@ index ccf2a1d..7e72c16 100644 } if (ret < 0 || ret1 < 0) { cleanup_pdump_resources(); -diff --git a/config/common_base b/config/common_base -index d12ae98..5c15ea0 100644 ---- a/config/common_base -+++ b/config/common_base -@@ -451,7 +451,7 @@ CONFIG_RTE_LIBRTE_PMD_NULL=y - # - # Compile software PMD backed by PCAP files - # --CONFIG_RTE_LIBRTE_PMD_PCAP=n -+CONFIG_RTE_LIBRTE_PMD_PCAP=y - - # - # Compile example software rings based PMD diff --git a/lib/librte_pdump/rte_pdump.c b/lib/librte_pdump/rte_pdump.c -index 6c3a885..d9a3258 100644 +index b3c8d5c..b73fb8f 100644 --- a/lib/librte_pdump/rte_pdump.c +++ b/lib/librte_pdump/rte_pdump.c @@ -9,6 +9,10 @@ @@ -299,13 +287,13 @@ index 6c3a885..d9a3258 100644 #include "rte_pdump.h" -@@ -132,6 +136,133 @@ struct pdump_response { - return m_dup; - } +@@ -69,6 +73,132 @@ static struct pdump_rxtx_cbs { + } rx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT], + tx_cbs[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT]; -+static int ++static int +inet_addr_equal(int af, const union addr *a1, -+ const union addr *a2) ++ const union addr *a2) +{ + switch (af) { + case AF_INET: @@ -328,99 +316,99 @@ index 6c3a885..d9a3258 100644 + default: + return -1; + } -+ ++ + return -1; +} +static int +pdump_filter(struct rte_mbuf *m, struct pdump_filter *filter) +{ -+ struct ether_hdr *eth_hdr; ++ struct rte_ether_hdr *eth_hdr; + struct vlan_eth_hdr *vlan_eth_hdr; + union addr s_addr, d_addr; + int prepend = 0; + uint16_t type = 0; + uint16_t iph_len = 0; -+ uint8_t proto = 0; ++ uint8_t proto = 0; + + int af; + + if (filter->af == 0 && filter->s_port == 0 && -+ filter->d_port == 0 && filter->proto == 0 && -+ filter->proto_port == 0) ++ filter->d_port == 0 && filter->proto == 0 && ++ filter->proto_port == 0) + return 0; + -+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); ++ eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); + + if (eth_hdr->ether_type == htons(ETH_P_8021Q)) { + prepend += sizeof(struct vlan_eth_hdr); + vlan_eth_hdr = rte_pktmbuf_mtod(m, struct vlan_eth_hdr *); + type = vlan_eth_hdr->h_vlan_encapsulated_proto; + } else { -+ prepend += sizeof(struct ether_hdr); -+ eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); ++ prepend += sizeof(struct rte_ether_hdr); ++ eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); + type = eth_hdr->ether_type; + } + + if (rte_pktmbuf_adj(m, prepend) == NULL) + goto prepend; + -+ if (type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) { -+ struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *); ++ if (type == rte_cpu_to_be_16(RTE_ETHER_TYPE_ARP)) { ++ 
struct rte_arp_hdr *arp = rte_pktmbuf_mtod(m, struct rte_arp_hdr *); + af = AF_INET; + s_addr.in.s_addr = arp->arp_data.arp_sip; + d_addr.in.s_addr = arp->arp_data.arp_tip; -+ } else if (type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) { -+ struct ipv4_hdr *ip4 = rte_pktmbuf_mtod(m, struct ipv4_hdr *); ++ } else if (type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) { ++ struct rte_ipv4_hdr *ip4 = rte_pktmbuf_mtod(m, struct rte_ipv4_hdr *); + af = AF_INET; + s_addr.in.s_addr = ip4->src_addr; + d_addr.in.s_addr = ip4->dst_addr; -+ proto = ip4->next_proto_id; -+ iph_len = (ip4->version_ihl & 0xf) << 2; -+ } else if (type == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) { -+ struct ipv6_hdr *ip6 = rte_pktmbuf_mtod(m, struct ipv6_hdr *); ++ proto = ip4->next_proto_id; ++ iph_len = (ip4->version_ihl & 0xf) << 2; ++ } else if (type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6)) { ++ struct rte_ipv6_hdr *ip6 = rte_pktmbuf_mtod(m, struct rte_ipv6_hdr *); + af = AF_INET6; + rte_memcpy(&s_addr.in6, &ip6->src_addr, 16); + rte_memcpy(&d_addr.in6, &ip6->dst_addr, 16); -+ proto = ip6->proto; -+ iph_len = sizeof(struct ipv6_hdr); ++ proto = ip6->proto; ++ iph_len = sizeof(struct rte_ipv6_hdr); + } else { + goto prepend; + } + + /*filter*/ -+ if (!inet_is_addr_any(af, &filter->s_addr) && ++ if (!inet_is_addr_any(af, &filter->s_addr) && + !inet_addr_equal(af, &filter->s_addr, &s_addr)) + goto prepend; -+ if (!inet_is_addr_any(af, &filter->d_addr) && ++ if (!inet_is_addr_any(af, &filter->d_addr) && + !inet_addr_equal(af, &filter->d_addr, &d_addr)) + goto prepend; -+ if (!inet_is_addr_any(af, &filter->host_addr) && ++ if (!inet_is_addr_any(af, &filter->host_addr) && + !inet_addr_equal(af, &filter->host_addr, &s_addr) && + !inet_addr_equal(af, &filter->host_addr, &d_addr)) + goto prepend; + + if (filter->proto && filter->proto != proto) -+ goto prepend; -+ -+ if (filter->s_port || filter->d_port || filter->proto_port) { -+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) + goto prepend; -+ struct udp_hdr _uh; -+ const struct udp_hdr *uh; -+ uh = rte_pktmbuf_read(m, iph_len, sizeof(_uh), &_uh); -+ if (uh == NULL) -+ goto prepend; -+ if (filter->s_port && filter->s_port != rte_cpu_to_be_16(uh->src_port)) -+ goto prepend; -+ -+ if (filter->d_port && filter->d_port != rte_cpu_to_be_16(uh->dst_port)) -+ goto prepend; -+ -+ if (filter->proto_port && -+ filter->proto_port != rte_cpu_to_be_16(uh->src_port) && -+ filter->proto_port != rte_cpu_to_be_16(uh->dst_port)) -+ goto prepend; -+ } ++ ++ if (filter->s_port || filter->d_port || filter->proto_port) { ++ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) ++ goto prepend; ++ struct rte_udp_hdr _uh; ++ const struct rte_udp_hdr *uh; ++ uh = rte_pktmbuf_read(m, iph_len, sizeof(_uh), &_uh); ++ if (uh == NULL) ++ goto prepend; ++ if (filter->s_port && filter->s_port != rte_cpu_to_be_16(uh->src_port)) ++ goto prepend; ++ ++ if (filter->d_port && filter->d_port != rte_cpu_to_be_16(uh->dst_port)) ++ goto prepend; ++ ++ if (filter->proto_port && ++ filter->proto_port != rte_cpu_to_be_16(uh->src_port) && ++ filter->proto_port != rte_cpu_to_be_16(uh->dst_port)) ++ goto prepend; ++ } + + rte_pktmbuf_prepend(m, prepend); + return 0; @@ -429,20 +417,19 @@ index 6c3a885..d9a3258 100644 + rte_pktmbuf_prepend(m, prepend); + return -1; +} -+ + static inline void pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params) - { -@@ -148,6 +279,8 @@ struct pdump_response { +@@ -86,6 +216,8 @@ pdump_copy(struct rte_mbuf **pkts, uint16_t nb_pkts, void *user_params) ring = cbs->ring; mp = cbs->mp; for (i 
= 0; i < nb_pkts; i++) { + if (pdump_filter(pkts[i], cbs->filter) != 0) + continue; - p = pdump_pktmbuf_copy(pkts[i], mp); + p = rte_pktmbuf_copy(pkts[i], mp, 0, UINT32_MAX); if (p) dup_bufs[d_pkts++] = p; -@@ -184,7 +317,7 @@ struct pdump_response { +@@ -122,7 +254,7 @@ pdump_tx(uint16_t port __rte_unused, uint16_t qidx __rte_unused, static int pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, struct rte_ring *ring, struct rte_mempool *mp, @@ -451,15 +438,15 @@ index 6c3a885..d9a3258 100644 { uint16_t qid; struct pdump_rxtx_cbs *cbs = NULL; -@@ -202,6 +335,7 @@ struct pdump_response { +@@ -140,6 +272,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, } cbs->ring = ring; cbs->mp = mp; -+ cbs->filter = filter; ++ cbs->filter = filter; cbs->cb = rte_eth_add_first_rx_callback(port, qid, pdump_rx, cbs); if (cbs->cb == NULL) { -@@ -238,7 +372,7 @@ struct pdump_response { +@@ -176,7 +309,7 @@ pdump_register_rx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, static int pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, struct rte_ring *ring, struct rte_mempool *mp, @@ -468,58 +455,58 @@ index 6c3a885..d9a3258 100644 { uint16_t qid; -@@ -257,6 +391,7 @@ struct pdump_response { +@@ -195,6 +328,7 @@ pdump_register_tx_callbacks(uint16_t end_q, uint16_t port, uint16_t queue, } cbs->ring = ring; cbs->mp = mp; -+ cbs->filter = filter; ++ cbs->filter = filter; cbs->cb = rte_eth_add_tx_callback(port, qid, pdump_tx, cbs); if (cbs->cb == NULL) { -@@ -300,6 +435,7 @@ struct pdump_response { +@@ -238,6 +372,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p) uint16_t operation; struct rte_ring *ring; struct rte_mempool *mp; -+ struct pdump_filter *filter; ++ struct pdump_filter *filter; flags = p->flags; operation = p->op; -@@ -315,6 +451,7 @@ struct pdump_response { +@@ -253,6 +388,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p) queue = p->data.en_v1.queue; ring = p->data.en_v1.ring; mp = p->data.en_v1.mp; -+ filter = p->data.en_v1.filter; ++ filter = p->data.en_v1.filter; } else { ret = rte_eth_dev_get_port_by_name(p->data.dis_v1.device, &port); -@@ -327,6 +464,7 @@ struct pdump_response { +@@ -265,6 +401,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p) queue = p->data.dis_v1.queue; ring = p->data.dis_v1.ring; mp = p->data.dis_v1.mp; -+ filter = p->data.dis_v1.filter; ++ filter = p->data.dis_v1.filter; } /* validation if packet capture is for all queues */ -@@ -358,7 +496,7 @@ struct pdump_response { +@@ -303,7 +440,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p) if (flags & RTE_PDUMP_FLAG_RX) { end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? nb_rx_q : queue + 1; ret = pdump_register_rx_callbacks(end_q, port, queue, ring, mp, - operation); -+ filter, operation); ++ filter, operation); if (ret < 0) return ret; } -@@ -367,7 +505,7 @@ struct pdump_response { +@@ -312,7 +449,7 @@ set_pdump_rxtx_cbs(const struct pdump_request *p) if (flags & RTE_PDUMP_FLAG_TX) { end_q = (queue == RTE_PDUMP_ALL_QUEUES) ? 
nb_tx_q : queue + 1; ret = pdump_register_tx_callbacks(end_q, port, queue, ring, mp, - operation); -+ filter, operation); ++ filter, operation); if (ret < 0) return ret; } diff --git a/lib/librte_pdump/rte_pdump.h b/lib/librte_pdump/rte_pdump.h -index 673a2b0..633b48f 100644 +index 6b00fc1..3986b07 100644 --- a/lib/librte_pdump/rte_pdump.h +++ b/lib/librte_pdump/rte_pdump.h @@ -15,6 +15,8 @@ @@ -531,35 +518,38 @@ index 673a2b0..633b48f 100644 #ifdef __cplusplus extern "C" { -@@ -34,6 +36,31 @@ enum rte_pdump_socktype { - RTE_PDUMP_SOCKET_CLIENT = 2 +@@ -29,6 +31,31 @@ enum { + RTE_PDUMP_FLAG_RXTX = (RTE_PDUMP_FLAG_RX|RTE_PDUMP_FLAG_TX) }; +union addr { -+ struct in_addr in; -+ struct in6_addr in6; ++ struct in_addr in; ++ struct in6_addr in6; +}; + +struct pdump_filter { -+ int af; ++ int af; + union addr s_addr; + union addr d_addr; + union addr host_addr; //s_addr or d_addr + -+ uint8_t proto; -+ uint16_t proto_port; //s_port or d_port -+ uint16_t s_port; -+ uint16_t d_port; ++ uint8_t proto; ++ uint16_t proto_port; //s_port or d_port ++ uint16_t s_port; ++ uint16_t d_port; +}; + +struct vlan_eth_hdr { -+ unsigned char h_dest[ETH_ALEN]; -+ unsigned char h_source[ETH_ALEN]; -+ unsigned short h_vlan_proto; -+ unsigned short h_vlan_TCI; -+ unsigned short h_vlan_encapsulated_proto; ++ unsigned char h_dest[ETH_ALEN]; ++ unsigned char h_source[ETH_ALEN]; ++ unsigned short h_vlan_proto; ++ unsigned short h_vlan_TCI; ++ unsigned short h_vlan_encapsulated_proto; +}; + /** * Initialize packet capturing handling * +-- +1.8.3.1 + diff --git a/patch/dpdk-stable-20.11.1/0003-debug-enable-dpdk-eal-memory-debug.patch b/patch/dpdk-stable-20.11.1/0003-debug-enable-dpdk-eal-memory-debug.patch new file mode 100644 index 000000000..03ff38ba7 --- /dev/null +++ b/patch/dpdk-stable-20.11.1/0003-debug-enable-dpdk-eal-memory-debug.patch @@ -0,0 +1,59 @@ +From e31fd685ced591060571375c70c69cd8ccf8dad9 Mon Sep 17 00:00:00 2001 +From: huangyichen +Date: Thu, 1 Jul 2021 21:24:47 +0800 +Subject: [PATCH 3/6] debug: enable dpdk eal memory debug + +The patch is used for memory debug. To use the patch, configure meson with option +-Dc_args="-DRTE_MALLOC_DEBUG" when building dpdk. For example, + +meson -Dc_args="-DRTE_MALLOC_DEBUG" -Dbuildtype=debug -Dprefix=$(pwd)/dpdklib dpdkbuild +ninja -C dpdkbuild +--- + lib/librte_eal/common/rte_malloc.c | 4 ++++ + lib/librte_eal/include/rte_malloc.h | 15 +++++++++++++++ + 2 files changed, 19 insertions(+) + +diff --git a/lib/librte_eal/common/rte_malloc.c b/lib/librte_eal/common/rte_malloc.c +index 9d39e58..2b6d1ab 100644 +--- a/lib/librte_eal/common/rte_malloc.c ++++ b/lib/librte_eal/common/rte_malloc.c +@@ -30,6 +30,10 @@ + #include "eal_memcfg.h" + #include "eal_private.h" + ++int rte_memmory_ok(void *addr) ++{ ++ return malloc_elem_cookies_ok(RTE_PTR_SUB(addr, MALLOC_ELEM_HEADER_LEN)); ++} + + /* Free the memory space back to heap */ + static void +diff --git a/lib/librte_eal/include/rte_malloc.h b/lib/librte_eal/include/rte_malloc.h +index 3af64f8..671e4f2 100644 +--- a/lib/librte_eal/include/rte_malloc.h ++++ b/lib/librte_eal/include/rte_malloc.h +@@ -248,6 +248,21 @@ rte_calloc_socket(const char *type, size_t num, size_t size, unsigned align, int + __rte_alloc_size(2, 3); + + /** ++ * Check the header/tailer cookies of memory pointed to by the provided pointer. ++ * ++ * This pointer must have been returned by a previous call to ++ * rte_malloc(), rte_zmalloc(), rte_calloc() or rte_realloc(). ++ * ++ * @param ptr ++ * The pointer to memory to be checked. 
++ * @return ++ * - true if the header/tailer cookies are OK. ++ * - Otherwise, false. ++ */ ++int ++rte_memmory_ok(void *ptr); ++ ++/** + * Frees the memory space pointed to by the provided pointer. + * + * This pointer must have been returned by a previous call to +-- +1.8.3.1 + diff --git a/patch/dpdk-stable-20.11.1/0004-ixgbe_flow-patch-ixgbe-fdir-rte_flow-for-dpvs.patch b/patch/dpdk-stable-20.11.1/0004-ixgbe_flow-patch-ixgbe-fdir-rte_flow-for-dpvs.patch new file mode 100644 index 000000000..ef7eda6d4 --- /dev/null +++ b/patch/dpdk-stable-20.11.1/0004-ixgbe_flow-patch-ixgbe-fdir-rte_flow-for-dpvs.patch @@ -0,0 +1,256 @@ +From 965c6ebd04d49ba578bab321ea87768669a2c7d1 Mon Sep 17 00:00:00 2001 +From: huangyichen +Date: Fri, 2 Jul 2021 11:55:47 +0800 +Subject: [PATCH 4/6] ixgbe_flow: patch ixgbe fdir rte_flow for dpvs + +1. Ignore fdir flow rule priority attribute. +2. Use different fdir soft-id for flow rules configured for the same queue. +3. Disable fdir mask settings by rte_flow. +4. Allow IPv6 to pass flow rule ETH item validation. +5. TCP & UDP flow item dest port = 0 is invalid of ixgbe_parse_ntuple_filter() +6. Safe free ixgbe_flow_list item of MARCO RTE_MALLOC_DEBUG is define (configure meson with option -Dc_args="-DRTE_MALLOC_DEBUG") +--- + drivers/net/ixgbe/ixgbe_flow.c | 119 ++++++++++++++++++++++++++++++++++++----- + 1 file changed, 105 insertions(+), 14 deletions(-) + +diff --git a/drivers/net/ixgbe/ixgbe_flow.c b/drivers/net/ixgbe/ixgbe_flow.c +index 9aeb2e4..481a06f 100644 +--- a/drivers/net/ixgbe/ixgbe_flow.c ++++ b/drivers/net/ixgbe/ixgbe_flow.c +@@ -2,7 +2,6 @@ + * Copyright(c) 2010-2016 Intel Corporation + */ + +-#include + #include + #include + #include +@@ -15,6 +14,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -468,6 +468,29 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr, + } + + tcp_spec = item->spec; ++ /* ++ * DPVS filted by fdir is expected, ++ * With dpvs single worker mode pattern had set: ++ * ----------------------------------------------- ++ * ITEM Spec Mask ++ * ETH NULL NULL ++ * IPV4|6 src_addr 0 0 ++ * dst_addr laddr 0xFFFFFFFF ++ * UDP|TCP src_port 0 0 ++ * dst_port 0 0 ++ * END ++ * ----------------------------------------------- ++ * It should return error here ++ * And continue by ixgbe_parse_fdir_filter() ++ * */ ++ if (tcp_spec->hdr.dst_port == 0 && ++ tcp_mask->hdr.dst_port == 0) { ++ memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); ++ rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM, ++ item, "Not supported by ntuple filter"); ++ return -rte_errno; ++ } + filter->dst_port = tcp_spec->hdr.dst_port; + filter->src_port = tcp_spec->hdr.src_port; + filter->tcp_flags = tcp_spec->hdr.tcp_flags; +@@ -501,6 +524,30 @@ cons_parse_ntuple_filter(const struct rte_flow_attr *attr, + filter->src_port_mask = udp_mask->hdr.src_port; + + udp_spec = item->spec; ++ /* ++ * DPVS filted by fdir is expected, ++ * With dpvs single worker mode pattern had set: ++ * ----------------------------------------------- ++ * ITEM Spec Mask ++ * ETH NULL NULL ++ * IPV4|6 src_addr 0 0 ++ * dst_addr laddr 0xFFFFFFFF ++ * UDP|TCP src_port 0 0 ++ * dst_port 0 0 ++ * END ++ * ----------------------------------------------- ++ * It should return error here ++ * And continue by ixgbe_parse_fdir_filter() ++ * */ ++ ++ if (udp_spec->hdr.dst_port == 0 && ++ udp_mask->hdr.dst_port == 0) { ++ memset(filter, 0, sizeof(struct rte_eth_ntuple_filter)); ++ rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM, ++ item, "Not 
supported by ntuple filter"); ++ return -rte_errno; ++ } + filter->dst_port = udp_spec->hdr.dst_port; + filter->src_port = udp_spec->hdr.src_port; + } else if (item->type == RTE_FLOW_ITEM_TYPE_SCTP) { +@@ -1419,11 +1466,8 @@ ixgbe_parse_fdir_act_attr(const struct rte_flow_attr *attr, + + /* not supported */ + if (attr->priority) { +- memset(rule, 0, sizeof(struct ixgbe_fdir_rule)); +- rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, +- attr, "Not support priority."); +- return -rte_errno; ++ PMD_DRV_LOG(INFO, "ixgbe flow doesn't support priority %d " ++ "(priority must be 0), ignore and continue....\n", attr->priority); + } + + /* check if the first not void action is QUEUE or DROP. */ +@@ -1642,7 +1686,7 @@ ixgbe_parse_fdir_filter_normal(struct rte_eth_dev *dev, + * value. So, we need not do anything for the not provided fields later. + */ + memset(rule, 0, sizeof(struct ixgbe_fdir_rule)); +- memset(&rule->mask, 0xFF, sizeof(struct ixgbe_hw_fdir_mask)); ++ memset(&rule->mask, 0, sizeof(struct ixgbe_hw_fdir_mask)); /* mask default zero */ + rule->mask.vlan_tci_mask = 0; + rule->mask.flex_bytes_mask = 0; + +@@ -1760,6 +1804,8 @@ ixgbe_parse_fdir_filter_normal(struct rte_eth_dev *dev, + } + } else { + if (item->type != RTE_FLOW_ITEM_TYPE_IPV4 && ++ /* Signature mode supports IPv6. */ ++ item->type != RTE_FLOW_ITEM_TYPE_IPV6 && + item->type != RTE_FLOW_ITEM_TYPE_VLAN) { + memset(rule, 0, sizeof(struct ixgbe_fdir_rule)); + rte_flow_error_set(error, EINVAL, +@@ -1815,6 +1861,10 @@ ixgbe_parse_fdir_filter_normal(struct rte_eth_dev *dev, + */ + rule->ixgbe_fdir.formatted.flow_type = + IXGBE_ATR_FLOW_TYPE_IPV4; ++ ++ /* Update flow rule mode by global param. */ ++ rule->mode = dev->data->dev_conf.fdir_conf.mode; ++ + /*Not supported last point for range*/ + if (item->last) { + rte_flow_error_set(error, EINVAL, +@@ -1888,6 +1938,9 @@ ixgbe_parse_fdir_filter_normal(struct rte_eth_dev *dev, + rule->ixgbe_fdir.formatted.flow_type = + IXGBE_ATR_FLOW_TYPE_IPV6; + ++ /* Update flow rule mode by global param. */ ++ rule->mode = dev->data->dev_conf.fdir_conf.mode; ++ + /** + * 1. must signature match + * 2. not support last +@@ -2748,12 +2801,45 @@ ixgbe_parse_fdir_filter_tunnel(const struct rte_flow_attr *attr, + return ixgbe_parse_fdir_act_attr(attr, actions, rule, error); + } + ++static inline int ++ixgbe_fdir_rule_patch(struct rte_eth_dev *dev, struct ixgbe_fdir_rule *rule) ++{ ++ static uint32_t softid[IXGBE_MAX_RX_QUEUE_NUM] = { 0 }; ++ ++ if (!rule) ++ return 0; ++ ++ if (!dev || !dev->data) ++ return -EINVAL; ++ if (rule->queue >= IXGBE_MAX_RX_QUEUE_NUM) ++ return -EINVAL; ++ ++ /* Soft-id for different rx-queue should be different. */ ++ rule->soft_id = softid[rule->queue]++; ++ ++ /* Disable mask config from rte_flow. ++ * FIXME: ++ * Ixgbe only supports one global mask, all the masks should be the same. ++ * Generally, fdir masks should be configured globally before port start. ++ * But the rte_flow configures masks at flow creation. So we disable fdir ++ * mask configs in rte_flow and configure it globally when port start. ++ * Refer to `ixgbe_dev_start/ixgbe_fdir_configure` for details. The global ++ * masks are configured into device initially with user specified params. ++ */ ++ rule->b_mask = 0; ++ ++ /* Use user-defined mode. 
*/
++	rule->mode = dev->data->dev_conf.fdir_conf.mode;
++
++	return 0;
++}
++
+ static int
+ ixgbe_parse_fdir_filter(struct rte_eth_dev *dev,
+ 			const struct rte_flow_attr *attr,
+ 			const struct rte_flow_item pattern[],
+ 			const struct rte_flow_action actions[],
+-			struct ixgbe_fdir_rule *rule,
++			struct ixgbe_fdir_rule *rule, bool b_patch,
+ 			struct rte_flow_error *error)
+ {
+ 	int ret;
+@@ -2787,13 +2873,18 @@ step_next:
+ 		rule->ixgbe_fdir.formatted.dst_port != 0))
+ 		return -ENOTSUP;
+ 
+-	if (fdir_mode == RTE_FDIR_MODE_NONE ||
+-	    fdir_mode != rule->mode)
++	if (fdir_mode == RTE_FDIR_MODE_NONE)
+ 		return -ENOTSUP;
+ 
+ 	if (rule->queue >= dev->data->nb_rx_queues)
+ 		return -ENOTSUP;
+ 
++	if (ret)
++		return ret;
++
++	if (b_patch)
++		return ixgbe_fdir_rule_patch(dev, rule);
++
+ 	return ret;
+ }
+ 
+@@ -3128,7 +3219,7 @@ ixgbe_flow_create(struct rte_eth_dev *dev,
+ 
+ 	memset(&fdir_rule, 0, sizeof(struct ixgbe_fdir_rule));
+ 	ret = ixgbe_parse_fdir_filter(dev, attr, pattern,
+-				actions, &fdir_rule, error);
++				actions, &fdir_rule, true, error);
+ 	if (!ret) {
+ 		/* A mask cannot be deleted. */
+ 		if (fdir_rule.b_mask) {
+@@ -3299,7 +3390,7 @@ ixgbe_flow_validate(struct rte_eth_dev *dev,
+ 
+ 	memset(&fdir_rule, 0, sizeof(struct ixgbe_fdir_rule));
+ 	ret = ixgbe_parse_fdir_filter(dev, attr, pattern,
+-				actions, &fdir_rule, error);
++				actions, &fdir_rule, false, error);
+ 	if (!ret)
+ 		return 0;
+ 
+@@ -3335,7 +3426,7 @@ ixgbe_flow_destroy(struct rte_eth_dev *dev,
+ 	struct ixgbe_eth_syn_filter_ele *syn_filter_ptr;
+ 	struct ixgbe_eth_l2_tunnel_conf_ele *l2_tn_filter_ptr;
+ 	struct ixgbe_fdir_rule_ele *fdir_rule_ptr;
+-	struct ixgbe_flow_mem *ixgbe_flow_mem_ptr;
++	struct ixgbe_flow_mem *ixgbe_flow_mem_ptr, *next_ptr;
+ 	struct ixgbe_hw_fdir_info *fdir_info =
+ 		IXGBE_DEV_PRIVATE_TO_FDIR_INFO(dev->data->dev_private);
+ 	struct ixgbe_rss_conf_ele *rss_filter_ptr;
+@@ -3432,7 +3523,7 @@ ixgbe_flow_destroy(struct rte_eth_dev *dev,
+ 		return ret;
+ 	}
+ 
+-	TAILQ_FOREACH(ixgbe_flow_mem_ptr, &ixgbe_flow_list, entries) {
++	TAILQ_FOREACH_SAFE(ixgbe_flow_mem_ptr, &ixgbe_flow_list, entries, next_ptr) {
+ 		if (ixgbe_flow_mem_ptr->flow == pmd_flow) {
+ 			TAILQ_REMOVE(&ixgbe_flow_list,
+ 				ixgbe_flow_mem_ptr, entries);
+-- 
+1.8.3.1
+
diff --git a/patch/dpdk-stable-20.11.1/0005-bonding-allow-slaves-from-different-numa-nodes.patch b/patch/dpdk-stable-20.11.1/0005-bonding-allow-slaves-from-different-numa-nodes.patch
new file mode 100644
index 000000000..473bec74c
--- /dev/null
+++ b/patch/dpdk-stable-20.11.1/0005-bonding-allow-slaves-from-different-numa-nodes.patch
@@ -0,0 +1,50 @@
+From a6393a8d04f1c8a4b324782aa5e242e10043a197 Mon Sep 17 00:00:00 2001
+From: huangyichen
+Date: Wed, 4 Aug 2021 15:16:04 +0800
+Subject: [PATCH 5/6] bonding: allow slaves from different numa nodes
+
+Note the patch may have a negative influence on performance.
+It's not good practice to bond slaves across numa nodes.
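[Editor's note: a minimal sketch of the cross-numa setup this patch permits; the device name and socket ids are illustrative, not from the patch. Since `rte_eth_bond_create()` fixes the numa node that `rte_eth_dma_zone_reserve()` later allocates the bond's queue rings from, the hunks below pass `SOCKET_ID_ANY` when setting up slave queues instead of the slave's own node.]

```c
#include <rte_ethdev.h>
#include <rte_eth_bond.h>

/* Create a bond on socket 0, then enslave a port whose NIC sits on
 * socket 1; before this patch the slave's queue setup could fail
 * because the queue memory had to come from the bond's own node. */
static int
bond_with_remote_slave(uint16_t slave_port_on_socket1)
{
    int bond_port = rte_eth_bond_create("net_bonding0",
                                        BONDING_MODE_ROUND_ROBIN,
                                        0 /* numa node fixed at creation */);
    if (bond_port < 0)
        return bond_port;

    return rte_eth_bond_slave_add(bond_port, slave_port_on_socket1);
}
```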
+---
+ drivers/net/bonding/rte_eth_bond_pmd.c | 18 ++++++++++++++++--
+ 1 file changed, 16 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
+index 057b1ad..53f8ba3 100644
+--- a/drivers/net/bonding/rte_eth_bond_pmd.c
++++ b/drivers/net/bonding/rte_eth_bond_pmd.c
+@@ -1762,7 +1762,14 @@ slave_configure(struct rte_eth_dev *bonded_eth_dev,
+ 
+ 		errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
+ 				bd_rx_q->nb_rx_desc,
+-				rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
++				// In spite of the performance penalty, bonding had better support
++				// slaves from different numa nodes. Considering that the numa node
++				// from which the bonding port's resources are allocated is specified
++				// by rte_eth_bond_create() at bonding creation, the slave's queue_setup
++				// would fail if it were given the slave's own numa node id whenever it
++				// differs from the bonding port's. See rte_eth_dma_zone_reserve() for
++				// details.
++				SOCKET_ID_ANY,
+ 				&(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
+ 		if (errval != 0) {
+ 			RTE_BOND_LOG(ERR,
+@@ -1778,7 +1785,14 @@ slave_configure(struct rte_eth_dev *bonded_eth_dev,
+ 
+ 		errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
+ 				bd_tx_q->nb_tx_desc,
+-				rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
++				// In spite of the performance penalty, bonding had better support
++				// slaves from different numa nodes. Considering that the numa node
++				// from which the bonding port's resources are allocated is specified
++				// by rte_eth_bond_create() at bonding creation, the slave's queue_setup
++				// would fail if it were given the slave's own numa node id whenever it
++				// differs from the bonding port's. See rte_eth_dma_zone_reserve() for
++				// details.
++				SOCKET_ID_ANY,
+ 				&bd_tx_q->tx_conf);
+ 		if (errval != 0) {
+ 			RTE_BOND_LOG(ERR,
+-- 
+1.8.3.1
+
diff --git a/patch/dpdk-stable-20.11.1/0006-bonding-fix-bonding-mode-4-problems.patch b/patch/dpdk-stable-20.11.1/0006-bonding-fix-bonding-mode-4-problems.patch
new file mode 100644
index 000000000..d2e53511a
--- /dev/null
+++ b/patch/dpdk-stable-20.11.1/0006-bonding-fix-bonding-mode-4-problems.patch
@@ -0,0 +1,144 @@
+From 38db21e38a36527a0e2e26f01a4b1f1bfd10c3d6 Mon Sep 17 00:00:00 2001
+From: huangyichen
+Date: Wed, 4 Aug 2021 15:14:04 +0800
+Subject: [PATCH 6/6] bonding: fix bonding mode 4 problems
+
+1. Fix the lacp packet receipt problem discussed in detail in issue [#725](https://github.com/iqiyi/dpvs/issues/725) of iqiyi/dpvs.
+2. Don't drop multicast/broadcast packets when all-multicast isn't enabled in rx_burst_8023ad.
+3. Don't drop lacp packets received from worker queues when the dedicated queue is enabled.
+---
+ drivers/net/bonding/rte_eth_bond_8023ad.c | 20 ++++++++------
+ drivers/net/bonding/rte_eth_bond_pmd.c    | 46 +++++++++++++++++--------------
+ 2 files changed, 40 insertions(+), 26 deletions(-)
+
+diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
+index 5fe004e..52bd960 100644
+--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
++++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
+@@ -831,7 +831,6 @@ bond_mode_8023ad_periodic_cb(void *arg)
+ 	struct port *port;
+ 	struct rte_eth_link link_info;
+ 	struct rte_ether_addr slave_addr;
+-	struct rte_mbuf *lacp_pkt = NULL;
+ 	uint16_t slave_id;
+ 	uint16_t i;
+ 
+@@ -903,6 +902,7 @@ bond_mode_8023ad_periodic_cb(void *arg)
+ 		/* Find LACP packet to this port. Do not check subtype,
diff --git a/patch/dpdk-stable-20.11.1/0006-bonding-fix-bonding-mode-4-problems.patch b/patch/dpdk-stable-20.11.1/0006-bonding-fix-bonding-mode-4-problems.patch
new file mode 100644
index 000000000..d2e53511a
--- /dev/null
+++ b/patch/dpdk-stable-20.11.1/0006-bonding-fix-bonding-mode-4-problems.patch
@@ -0,0 +1,144 @@
+From 38db21e38a36527a0e2e26f01a4b1f1bfd10c3d6 Mon Sep 17 00:00:00 2001
+From: huangyichen
+Date: Wed, 4 Aug 2021 15:14:04 +0800
+Subject: [PATCH 6/6] bonding: fix bonding mode 4 problems
+
+1. Fix lacp packet receipt problem that is discussed in issue [#725](https://github.com/iqiyi/dpvs/issues/725) of iqiyi/dpvs in detail.
+2. Don't drop multicast/broadcast packets when all-multicast isn't enabled in rx_burst_8023ad.
+3. Don't drop lacp packets received from worker queues when dedicated queue enabled.
+---
+ drivers/net/bonding/rte_eth_bond_8023ad.c | 20 ++++++++------
+ drivers/net/bonding/rte_eth_bond_pmd.c    | 46 +++++++++++++++++++------------
+ 2 files changed, 40 insertions(+), 26 deletions(-)
+
+diff --git a/drivers/net/bonding/rte_eth_bond_8023ad.c b/drivers/net/bonding/rte_eth_bond_8023ad.c
+index 5fe004e..52bd960 100644
+--- a/drivers/net/bonding/rte_eth_bond_8023ad.c
++++ b/drivers/net/bonding/rte_eth_bond_8023ad.c
+@@ -831,7 +831,6 @@ bond_mode_8023ad_periodic_cb(void *arg)
+ 	struct port *port;
+ 	struct rte_eth_link link_info;
+ 	struct rte_ether_addr slave_addr;
+-	struct rte_mbuf *lacp_pkt = NULL;
+ 	uint16_t slave_id;
+ 	uint16_t i;
+ 
+@@ -903,6 +902,7 @@ bond_mode_8023ad_periodic_cb(void *arg)
+ 			/* Find LACP packet to this port. Do not check subtype,
+ 			 * it is done in function that queued packet
+ 			 */
++			struct rte_mbuf *lacp_pkt = NULL;
+ 			int retval = rte_ring_dequeue(port->rx_ring,
+ 					(void **)&lacp_pkt);
+ 
+@@ -911,15 +911,17 @@ bond_mode_8023ad_periodic_cb(void *arg)
+ 
+ 			rx_machine_update(internals, slave_id, lacp_pkt);
+ 		} else {
+-			uint16_t rx_count = rte_eth_rx_burst(slave_id,
+-					internals->mode4.dedicated_queues.rx_qid,
+-					&lacp_pkt, 1);
+-
+-			if (rx_count == 1)
+-				bond_mode_8023ad_handle_slow_pkt(internals,
+-						slave_id, lacp_pkt);
+-			else
++			uint16_t rx_count, j;
++			struct rte_mbuf *lacp_pkt[16] = { NULL };
++
++			rx_count = rte_eth_rx_burst(slave_id, internals->mode4.dedicated_queues.rx_qid,
++					&lacp_pkt[0], sizeof(lacp_pkt)/sizeof(struct rte_mbuf *));
++			if (rx_count > 0) {
++				for (j = 0; j < rx_count; j++)
++					bond_mode_8023ad_handle_slow_pkt(internals, slave_id, lacp_pkt[j]);
++			} else {
+ 				rx_machine_update(internals, slave_id, NULL);
++			}
+ 		}
+ 
+ 		periodic_machine(internals, slave_id);
+diff --git a/drivers/net/bonding/rte_eth_bond_pmd.c b/drivers/net/bonding/rte_eth_bond_pmd.c
+index 53f8ba3..42e436c 100644
+--- a/drivers/net/bonding/rte_eth_bond_pmd.c
++++ b/drivers/net/bonding/rte_eth_bond_pmd.c
+@@ -291,7 +291,6 @@ rx_burst_8023ad(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts,
+ 
+ 	uint8_t collecting;  /* current slave collecting status */
+ 	const uint8_t promisc = rte_eth_promiscuous_get(internals->port_id);
+-	const uint8_t allmulti = rte_eth_allmulticast_get(internals->port_id);
+ 	uint8_t subtype;
+ 	uint16_t i;
+ 	uint16_t j;
+@@ -322,6 +321,15 @@ rx_burst_8023ad(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts,
+ 
+ 	/* Handle slow protocol packets. */
+ 	while (j < num_rx_total) {
++		/* If packet is not pure L2 and is known:
++		 * Such as OSPF protocol multicast packet,
++		 * we want to handle it in user mode by ourselves,
++		 * skip slow protocol flow */
++		if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER) != 0) {
++			j++;
++			continue;
++		}
++
+ 		if (j + 3 < num_rx_total)
+ 			rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));
+ 
+@@ -331,24 +339,26 @@ rx_burst_8023ad(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts,
+ 		/* Remove packet from array if:
+ 		 * - it is slow packet but no dedicated rxq is present,
+ 		 * - slave is not in collecting state,
+-		 * - bonding interface is not in promiscuous mode:
+-		 *   - packet is unicast and address does not match,
+-		 *   - packet is multicast and bonding interface
+-		 *     is not in allmulti,
++		 * - bonding interface is not in promiscuous mode and
++		 *   packet is unicast and address does not match,
+ 		 */
+ 		if (unlikely(
+-			(!dedicated_rxq &&
+-			 is_lacp_packets(hdr->ether_type, subtype,
+-					 bufs[j])) ||
+-			!collecting ||
+-			(!promisc &&
+-			 ((rte_is_unicast_ether_addr(&hdr->d_addr) &&
+-			   !rte_is_same_ether_addr(bond_mac,
+-					       &hdr->d_addr)) ||
+-			  (!allmulti &&
+-			   rte_is_multicast_ether_addr(&hdr->d_addr)))))) {
+-
++			(is_lacp_packets(hdr->ether_type, subtype, bufs[j])) ||
++			!collecting || (!promisc &&
++				(rte_is_unicast_ether_addr(&hdr->d_addr) &&
++				!rte_is_same_ether_addr(bond_mac, &hdr->d_addr))))) {
+ 			if (hdr->ether_type == ether_type_slow_be) {
++				if (dedicated_rxq) {
++					/* Error! Lacp packets should never appear here if
++					 * dedicated queue enabled. This can be caused by
++					 * a lack of support for ethertype rte_flow. Just
++					 * issue a warning rather than dropping the packets
++					 * so that the lacp state machine can work properly.
++					 */
++					RTE_BOND_LOG(WARNING, "receive lacp packets from queue %d "
++							"of port %d when dedicated queue enabled",
++							bd_rx_q->queue_id, slaves[idx]);
++				}
+ 				bond_mode_8023ad_handle_slow_pkt(
+ 				    internals, slaves[idx], bufs[j]);
+ 			} else
+@@ -1271,8 +1281,10 @@ skip_tx_ring:
+ 				slave_port_ids[i];
+ 	}
+ 
+-	if (unlikely(dist_slave_count < 1))
++	if (unlikely(dist_slave_count < 1)) {
++		RTE_BOND_LOG(WARNING, "no distributing slaves on bonding port %d", internals->port_id);
+ 		return 0;
++	}
+ 
+ 	return tx_burst_balance(queue, bufs, nb_bufs, dist_slave_port_ids,
+ 			dist_slave_count);
+-- 
+1.8.3.1
+
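Notes (editorial sketch, not part of the patch): the 8023ad fix above replaces a one-LACPDU-per-tick read from the dedicated queue with a bounded burst. A self-contained sketch of that pattern, where `drain_dedicated_rxq` and `handle_slow_pkt` are hypothetical stand-ins for the periodic callback and `bond_mode_8023ad_handle_slow_pkt()`, and the burst size of 16 mirrors the `lacp_pkt[16]` array the patch uses:

```c
#include <stdint.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define SLOW_PKT_BURST 16  /* mirrors lacp_pkt[16] in the patch */

/* Sketch only: bounded drain of a dedicated slow-protocol rx queue. */
static void drain_dedicated_rxq(uint16_t slave_id, uint16_t rx_qid,
                                void (*handle_slow_pkt)(uint16_t slave_id,
                                                        struct rte_mbuf *pkt))
{
    struct rte_mbuf *pkts[SLOW_PKT_BURST];
    uint16_t i, n;

    /* Reading one frame per periodic tick can fall behind whenever more
     * than one LACPDU arrives within a period; a bounded burst keeps the
     * state machine fed without unbounded work in the timer context. */
    n = rte_eth_rx_burst(slave_id, rx_qid, pkts, SLOW_PKT_BURST);
    for (i = 0; i < n; i++)
        handle_slow_pkt(slave_id, pkts[i]);
}
```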
diff --git a/scripts/dpdk-build.sh b/scripts/dpdk-build.sh
new file mode 100755
index 000000000..9e7741292
--- /dev/null
+++ b/scripts/dpdk-build.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+############################################################################
+# usage: $0 [-v dpdk-version] [-d] [-w work-directory] [-p patch-directory]
+
+build_options="-Denable_kmods=true"
+debug_options="-Dbuildtype=debug -Dc_args=-DRTE_MALLOC_DEBUG"
+
+dpdkver=20.11.1             # default dpdk version (use stable version)
+tarball=dpdk-${dpdkver}.tar.xz
+srcdir=dpdk-stable-$dpdkver
+
+workdir=$(pwd)/dpdk         # default work directory
+patchdir=$(pwd)/patch/dpdk-stable-$dpdkver  # default dpdk patch directory
+
+
+function help()
+{
+    echo -e "\033[31musage: $0 [-v dpdk-version] [-d] [-w work-directory] [-p patch-directory]\033[0m"
+    echo -e "\033[31mOPTIONS:\033[0m"
+    echo -e "\033[31m    -v    specify the dpdk version, default $dpdkver\033[0m"
+    echo -e "\033[31m    -d    build dpdk library with debug info\033[0m"
+    echo -e "\033[31m    -w    specify the work directory prefix, default $(pwd)\033[0m"
+    echo -e "\033[31m    -p    specify the dpdk patch directory, default $(pwd)/patch/dpdk-stable-$dpdkver\033[0m"
+}
+
+function getfullpath()
+{
+    local dir=$(dirname $1)
+    local base=$(basename $1)
+    if test -d ${dir}; then
+        pushd ${dir} >/dev/null 2>&1
+        echo ${PWD}/${base}
+        popd >/dev/null 2>&1
+        return 0
+    fi
+    return 1
+}
+
+function set_work_directory()
+{
+    [ ! -d $1 ] && return 1
+    workdir=$(getfullpath $1)/dpdk
+}
+
+function set_patch_directory()
+{
+    [ ! -d $1 ] && return 1
+    patchdir=$(getfullpath $1)
+}
+
+## parse args
+while getopts "hw:p:dv:" OPT; do
+    case $OPT in
+        v) dpdkver=$OPTARG;;
+        w) set_work_directory $OPTARG ;;
+        p) set_patch_directory $OPTARG;;
+        d) build_options="${build_options} ${debug_options}";;
+        ?) help && exit 1;;
+    esac
+done
+
+[ ! -d $workdir ] && mkdir $workdir
+echo -e "\033[32mwork directory: $workdir\033[0m"
+
+[ ! -d $patchdir ] && echo -e "\033[31mdpdk patch file directory doesn't exist: $patchdir\033[0m" && exit 1
+echo -e "\033[32mdpdk patch directory: $patchdir\033[0m"
+
+echo -e "\033[32mbuild options: $build_options\033[0m"
+
+## prepare dpdk sources
+cd $workdir
+if [ ! -f $tarball ]; then
+    wget https://fast.dpdk.org/rel/$tarball -P $workdir
+    [ ! -f $tarball ] && echo -e "\033[31mfail to download $tarball\033[0m" && exit 1
+fi
+
+[ -d $workdir/$srcdir ] && echo -e "\033[33mremoving old source directory: $workdir/$srcdir\033[0m" && rm -rf $workdir/$srcdir
+tar xf $tarball -C $workdir
+echo "$(pwd), $workdir, $srcdir"
+[ ! -d $workdir/$srcdir ] && echo -e "\033[31m$workdir/$srcdir directory is missing\033[0m" && exit 1
+
+## patch dpdk
+for patchfile in $(ls $patchdir)
+do
+    patch -p1 -d $workdir/$srcdir < $patchdir/$patchfile
+    [ $? -ne 0 ] && echo -e "\033[31mfail to patch: $patchfile\033[0m" && exit 1
+    echo -e "\033[32msucceed to patch: $patchfile\033[0m"
+done
+
+## build dpdk and install
+[ -d dpdkbuild ] && rm -rf dpdkbuild/* || mkdir dpdkbuild
+[ -d dpdklib ] && rm -rf dpdklib/* || mkdir dpdklib
+
+meson $build_options -Dprefix=$(pwd)/dpdklib $srcdir dpdkbuild
+
+ninja -C dpdkbuild
+[ $? -ne 0 ] && echo -e "\033[31mfail to build dpdk\033[0m" && exit 1
+ninja -C dpdkbuild install
+[ $? -ne 0 ] && echo -e "\033[31mfail to install dpdk\033[0m" && exit 1
+
+kni=dpdkbuild/kernel/linux/kni/rte_kni.ko
+[ -f $kni ] && install -m 644 $kni dpdklib
+
+echo -e "DPDK library installed successfully into directory: \033[32m$(pwd)/dpdklib\033[0m"
+
+## export dpdk lib
+echo -e "You can use this library in dpvs by running the command below:"
+echo -e "\033[32m"
+echo -e "export PKG_CONFIG_PATH=$(pwd)/dpdklib/lib64/pkgconfig"
+echo -e "\033[0m"
diff --git a/src/Makefile b/src/Makefile
index 02f288c31..1ef63e5ea 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -34,10 +34,8 @@ DATE_STRING := $(shell date +%Y.%m.%d.%H:%M:%S)
 # same path of THIS Makefile
 SRCDIR := $(dir $(realpath $(firstword $(MAKEFILE_LIST))))
 
-ifeq ($(RTE_SDK),)
-    $(error "The variable RTE_SDK is not defined.")
-endif
-include $(RTE_SDK)/mk/rte.vars.mk
+# Additional libs below are needed when using dynamic link.
+LIBS += -lpthread -lnuma -lrt -lm -ldl -lcrypto
 
 include $(SRCDIR)/config.mk
 include $(SRCDIR)/dpdk.mk
@@ -62,9 +60,8 @@ else
 CFLAGS += -rdynamic
 endif
 
-LIBS += -lpthread -lnuma
-
-CFLAGS += $(INCDIRS) $(LIBS)
+CFLAGS += $(INCDIRS)
 
 OBJS := $(shell find $(SRCDIR) -name '*.c' | sort)
 OBJS := $(patsubst %.c,%.o,$(OBJS))
@@ -73,7 +70,7 @@ all: $(TARGET)
 
 $(TARGET): $(OBJS)
 	@echo "  $(notdir $@)"
-	$(Q)$(CC) $(CFLAGS) $^ -o $@
+	$(Q)$(CC) $(CFLAGS) $^ $(LIBS) -o $@
 
 %.o: %.c
 	@echo "  $(notdir $@)"
diff --git a/src/VERSION b/src/VERSION
index 76f9440e2..8b6c402ce 100755
--- a/src/VERSION
+++ b/src/VERSION
@@ -1,33 +1,33 @@
 #!/bin/sh
-
 # program: dpvs
-# Apr 26, 2021
+# Jul 28, 2021
+#
+# Major changes:
+# - Adapt dpvs to dpdk 20.11 (dpdk-stable-20.11.1).
+# - Create branch DPVS-1.8-LTS to support dpdk 18.11.
+# - Obsolete support for dpdk 17.11.
+#
+# Features:
+# - Dpvs: Add netif_flow module using generic flow api (rte_flow), and replace flow director with rte_flow.
+# - Dpvs: Replace mbuf userdata with mbuf dynfields.
+# - Dpvs: Adapt dpvs to several renamed type names in dpdk 20.11.
+# - Dpvs: Update Makefiles to support dpdk 20.11.
+# - Dpvs: Add config option "dedicated_queues" for bonding mode 4 (802.3ad).
+# - Dpdk: Add helper script to facilitate dpdk build.
+# - Dpdk: Port patches to dpdk 20.11 and remove patches of previous dpdk versions (18.11, 17.11).
+# - Dpdk: Patch dpdk ixgbe pmd driver to support dpvs's flow api.
+# - Dpdk: Patch dpdk bonding mode 4 for mlx5 to fix crash problem when debugging.
+# - Keepalived: Add UDP_CHECK health checker.
+# - Docs: Refine tutorial doc of section 'Full-NAT with Keepalived (one-arm)'.
+# - Docs: Update docs for dpvs use with dpdk 20.11.
+# - Ci: Update dpvs ci to support dpdk 20.11.
+#
+# Bugfix:
+# - Dpvs: Fix ipvs rr/wrr/wlc problem of uneven load distribution across dests.
+# - Dpvs: Fix bonding mode 4 problem caused by LACP failure.
 #
-# Features
-# ----------
-# - CI: Enable CI workflow.
-# - Dpvs: TC stability and performance enhancement.
-# - Dpvs: TC supports ipv6 and ingress traffic.
-# - Dpvs: Add document and examples for dpvs tc.
-# - Dpvs: Add supports for ipvs whitelist.
-# - Dpvs: Support icmp forwarding with icmp_fwd_core. -# - Dpvs: Support mtu config. -# - Dpvs: Obsolete dpdk 16.07 and 17.05.02. -# - Patch: Add eal memory debug patch for dpdk-stable-18.11.2. -# -# # Bugfix -# -------- -# - Dpvs: Fix traceroute problem of dpvs ip address. -# - Dpvs: Fix flags conflicts for ipvs conn/service/dest. -# - Dpvs: Reset tcp connection when syn-cookie check fails. -# - Dpvs: Use correct mbuf:l4_len for checkout offload. -# - Dpvs: Fix udp checksum problem for uoa when checksum offload is off. -# - Dpvs: Simplify checksum calculations and remove superfluous checksum functions. -# - Dpvs: Refactor netif recv procedure. -# - Dpvs: Fix debug level log problem. -# - Keepalived: Fix problem that local ip config doesn't take effect when restart. -# - Keepalived: Fix crash problem when tunnel is configured. -export VERSION=1.8 -export RELEASE=10 +export VERSION=1.9 +export RELEASE=0 echo $VERSION-$RELEASE diff --git a/src/common.c b/src/common.c index 2cde7a9dd..c10505cf4 100644 --- a/src/common.c +++ b/src/common.c @@ -130,34 +130,83 @@ bool is_power2(int num, int offset, int *lower) return ret; } -int linux_set_if_mac(const char *ifname, const unsigned char mac[ETH_ALEN]) +int linux_get_link_status(const char *ifname, int *if_flags, char *if_flags_str, size_t len) { int sock_fd; struct ifreq ifr = {}; - if (!ifname || !mac || !strncmp(ifname, "lo", 2)) + if (!ifname || !if_flags) return EDPVS_INVAL; + *if_flags= 0; + sock_fd = socket(PF_INET, SOCK_DGRAM, 0); if (sock_fd < 0) return EDPVS_SYSCALL; snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", ifname); + if (ioctl(sock_fd, SIOCGIFFLAGS, &ifr)) { + fprintf(stderr, "%s: fail to get %s's flags -- %s\n", + __func__, ifname, strerror(errno)); + close(sock_fd); + return EDPVS_IO; + } + close(sock_fd); + + *if_flags = ifr.ifr_flags; + + if (if_flags_str) { + int idx = 0; + idx += snprintf(&if_flags_str[idx], len-idx-1, "%s:", ifname); + if(*if_flags & IFF_UP) + idx += snprintf(&if_flags_str[idx], len-idx-1, " UP"); + if(*if_flags & IFF_MULTICAST) + idx += snprintf(&if_flags_str[idx], len-idx-1, " MULTICAST"); + if(*if_flags & IFF_BROADCAST) + idx += snprintf(&if_flags_str[idx], len-idx-1, " BROADCAST"); + if(*if_flags & IFF_LOOPBACK) + idx += snprintf(&if_flags_str[idx], len-idx-1, " LOOPBACK"); + if(*if_flags & IFF_POINTOPOINT) + idx += snprintf(&if_flags_str[idx], len-idx-1, " P2P"); + } + + return EDPVS_OK; +} + +int linux_set_if_mac(const char *ifname, const unsigned char mac[ETH_ALEN]) +{ + int err; + int sock_fd, if_flags; + struct ifreq ifr = {}; + + if (!ifname || !mac || !strncmp(ifname, "lo", 2)) + return EDPVS_INVAL; + + err = linux_get_link_status(ifname, &if_flags, NULL, 0); + if (err != EDPVS_OK) + return err; + + if (!(if_flags & IFF_UP)) { + fprintf(stderr, "%s: skip MAC address update of link down device %s\n", + __func__, ifname); + return EDPVS_RESOURCE; + } + + sock_fd = socket(PF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) + return EDPVS_SYSCALL; + snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", ifname); ifr.ifr_hwaddr.sa_family = 1; memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ALEN); if (ioctl(sock_fd, SIOCSIFHWADDR, &ifr)) { - /* DPDK 18.11, 'kni_net_process_request' is called when updating - * device's mac address, in which 'wait_event_interruptible_timeout' - * is used to wait for setting results, which may easily get timeout and - * return fail. 
We ignore the error here and return OK nevertheless.*/ - fprintf(stderr, "%s: fail to set %s's MAC address: %s\n", + fprintf(stderr, "%s: fail to set %s's MAC address -- %s\n", __func__, ifname, strerror(errno)); close(sock_fd); - return EDPVS_OK; + return EDPVS_IO; } - close(sock_fd); + return EDPVS_OK; } diff --git a/src/config.mk b/src/config.mk index c9871fc96..930353ac6 100644 --- a/src/config.mk +++ b/src/config.mk @@ -21,7 +21,7 @@ # TODO: use standard way to define compile flags. # -CONFIG_MLX5=n +CONFIG_IXGEB_PMD=y CONFIG_PDUMP=y CFLAGS += -D DPVS_MAX_SOCKET=2 @@ -46,6 +46,11 @@ CFLAGS += -D DPVS_MAX_LCORE=64 #CFLAGS += -D CONFIG_DPVS_MP_DEBUG #CFLAGS += -D CONFIG_ICMP_REDIRECT_CORE +# for ixgbe nic +ifeq ($(CONFIG_IXGEB_PMD), y) +CFLAGS += -D CONFIG_DPVS_FDIR +endif + ifeq ($(CONFIG_PDUMP), y) CFLAGS += -D CONFIG_DPVS_PDUMP endif @@ -53,3 +58,4 @@ endif GCC_MAJOR = $(shell echo __GNUC__ | $(CC) -E -x c - | tail -n 1) GCC_MINOR = $(shell echo __GNUC_MINOR__ | $(CC) -E -x c - | tail -n 1) GCC_VERSION = $(GCC_MAJOR)$(GCC_MINOR) + diff --git a/src/ctrl.c b/src/ctrl.c index c3a61b894..e1ded4e1f 100644 --- a/src/ctrl.c +++ b/src/ctrl.c @@ -852,7 +852,7 @@ int msg_type_table_print(char *buf, int len) rte_rwlock_read_lock(&mt_lock[ii][jj]); list_for_each_entry(mt, &mt_array[ii][jj], list) { memset(line, 0, sizeof(line)); - snprintf(line, sizeof(line), "mt_array[%-2d][%-4d] type %-8d mode %-12s" + snprintf(line, sizeof(line), "mt_array[%-2d][%-2d] type %-8d mode %-12s" " unicast_cb %p multicast_cb %p\n", ii, jj, mt->type, mt->mode == DPVS_MSG_UNICAST ? "UNICAST" : "MULITICAST", mt->unicast_msg_cb, mt->multicast_msg_cb); @@ -1058,7 +1058,7 @@ static inline int msg_init(void) /* lcore mask init */ slave_lcore_mask = 0; slave_lcore_nb = 0; - master_lcore = rte_get_master_lcore(); + master_lcore = rte_get_main_lcore(); netif_get_slave_lcores(&slave_lcore_nb, &slave_lcore_mask); if (slave_lcore_nb > MSG_MAX_LCORE_SUPPORTED) { @@ -1198,10 +1198,18 @@ static inline int sockopts_exist(struct dpvs_sockopts *sockopts) judge_id_betw(sockopts->set_opt_max, skopt->set_opt_min, skopt->set_opt_max)) { return 1; } + if (judge_id_betw(skopt->set_opt_min, sockopts->set_opt_min, sockopts->set_opt_max) || + judge_id_betw(skopt->set_opt_max, sockopts->set_opt_min, sockopts->set_opt_max)) { + return 1; + } if (judge_id_betw(sockopts->get_opt_min, skopt->get_opt_min, skopt->get_opt_max) || judge_id_betw(sockopts->get_opt_max, skopt->get_opt_min, skopt->get_opt_max)) { return 1; } + if (judge_id_betw(skopt->get_opt_min, sockopts->get_opt_min, sockopts->get_opt_max) || + judge_id_betw(skopt->get_opt_max, sockopts->get_opt_min, sockopts->get_opt_max)) { + return 1; + } } return 0; } diff --git a/src/dpdk.mk b/src/dpdk.mk index c96e75c7f..e3c4c2759 100644 --- a/src/dpdk.mk +++ b/src/dpdk.mk @@ -15,46 +15,35 @@ # GNU General Public License for more details. # -ifeq ($(RTE_SDK),) -$(error "The variable RTE_SDK is not defined.") -endif -# default target, may be overriden. 
-RTE_TARGET ?= build - -DPDKDIR := $(RTE_SDK)/$(RTE_TARGET) - -INCDIRS += -I $(DPDKDIR)/include - -CFLAGS += -include $(DPDKDIR)/include/rte_config.h - -LIBS += -L $(DPDKDIR)/lib - -LIBS += -Wl,--no-as-needed -fvisibility=default \ - -Wl,--whole-archive -lrte_pmd_vmxnet3_uio -lrte_pmd_i40e -lrte_pmd_ixgbe -lrte_pmd_ena \ - -lrte_pmd_e1000 -lrte_pmd_bnxt -lrte_pmd_ring -lrte_pmd_bond -lrte_ethdev -lrte_ip_frag \ - -Wl,--whole-archive -lrte_hash -lrte_kvargs -Wl,-lrte_mbuf -lrte_eal \ - -Wl,-lrte_mempool -lrte_ring -lrte_cmdline -lrte_cfgfile -lrte_kni \ - -lrte_mempool_ring -lrte_timer -lrte_net -Wl,-lrte_pmd_virtio \ - -lrte_pci -lrte_bus_pci -lrte_bus_vdev -lrte_lpm -lrte_pdump \ - -Wl,--no-whole-archive -lrt -lm -ldl -lcrypto - -ifeq ($(CONFIG_PDUMP), y) -LIBS += -Wl,--whole-archive -lrte_acl -lrte_member -lrte_eventdev -lrte_reorder -lrte_cryptodev \ - -lrte_vhost -lrte_pmd_pcap - -ifneq ("$(wildcard $(RTE_SDK)/$(RTE_TARGET)/lib/librte_bus_vmbus.a)", "") - LIBS += -lrte_bus_vmbus +# If the dpdklib isn't installed to the default location on your system, +# please specify PKG_CONFIG_PATH explicitly as below. +# +# LIBDPDKPC_PATH := /path/to/dpdk/build/lib/pkgconfig + +define PKG_CONFIG_ERR_MSG +DPDK library was not found. +If dpdk has installed already, please ensure the libdpdk.pc file could be found by `pkg-config`. +You may fix the problem by setting LIBDPDKPC_PATH (in file src/dpdk.mk) to the path of libdpdk.pc file explicitly +endef + +# It's noted that pkg-config version 0.29.2 is recommended, +# pkg-config 0.27.1 would mess up the ld flags when linking dpvs. +PKGCONFIG_VERSION=$(shell pkg-config pkg-config --version) +ifneq "v$(PKGCONFIG_VERSION)" "v0.29.2" +$(warning "The pkg-config version is $(PKGCONFIG_VERSION) but 0.29.2 is recommended.") +ifeq "v$(PKGCONFIG_VERSION)" "v0.27.1" +$(error "pkg-config version $(PKGCONFIG_VERSION) isn't supported by dpvs, please use 0.29.2 instead.") endif - -ifneq ("$(wildcard $(RTE_SDK)/$(RTE_TARGET)/lib/librte_pmd_netvsc.a)", "") - LIBS += -lrte_pmd_netvsc endif -LIBS += -Wl,--no-whole-archive -lpcap +ifeq ($(shell pkg-config --exists libdpdk && echo 0),0) +CFLAGS += -DALLOW_EXPERIMENTAL_API $(shell pkg-config --cflags libdpdk) +LIBS += $(shell pkg-config --static --libs libdpdk) +else +ifneq ($(wildcard $(LIBDPDKPC_PATH)),) +CFLAGS += -DALLOW_EXPERIMENTAL_API $(shell PKG_CONFIG_PATH=$(LIBDPDKPC_PATH) pkg-config --cflags libdpdk) +LIBS += $(shell PKG_CONFIG_PATH=$(LIBDPDKPC_PATH) pkg-config --static --libs libdpdk) +else +$(error $(PKG_CONFIG_ERR_MSG)) endif - -ifeq ($(CONFIG_MLX5), y) -LIBS += -Wl,--whole-archive -lrte_pmd_mlx5 -Wl,--no-whole-archive -LIBS += -libverbs -lmlx5 -lmnl endif - diff --git a/src/eal_mem.c b/src/eal_mem.c index 44dd468fc..2574d0e36 100644 --- a/src/eal_mem.c +++ b/src/eal_mem.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "conf/eal_mem.h" #include "eal_mem.h" #include "ctrl.h" @@ -60,7 +61,7 @@ static int dp_vs_fill_mem_seg_info(const struct rte_memseg_list *msl, const stru seg_ret = &eal_mem_segs->seg_info[eal_mem_segs->seg_num]; eal_mem_segs->seg_num++; - seg_ret->phys_addr = ms->phys_addr; + seg_ret->iova = ms->iova; seg_ret->virt_addr = ms->addr_64; seg_ret->len = ms->len; seg_ret->hugepage_sz = ms->hugepage_sz; @@ -84,7 +85,7 @@ static void dp_vs_fill_mem_zone_info(const struct rte_memzone *mz, void *arg) eal_mem_zones->zone_num++; memcpy(zone_ret->name, mz->name, EAL_MEM_NAME_LEN); - zone_ret->phys_addr = mz->phys_addr; + zone_ret->iova = mz->iova; zone_ret->virt_addr = mz->addr_64; 
zone_ret->len = mz->len; zone_ret->hugepage_sz = mz->hugepage_sz; @@ -110,7 +111,7 @@ static int dp_vs_get_eal_mem_seg(eal_all_mem_seg_ret_t *eal_mem_segs) } seg_ret = &eal_mem_segs->seg_info[eal_mem_segs->seg_num]; eal_mem_segs->seg_num++; - seg_ret->phys_addr = mcfg->memseg[i].phys_addr; + seg_ret->iova = mcfg->memseg[i].iova; seg_ret->virt_addr = mcfg->memseg[i].addr_64; seg_ret->len = mcfg->memseg[i].len; seg_ret->hugepage_sz = mcfg->memseg[i].hugepage_sz; @@ -171,7 +172,7 @@ static int dp_vs_get_eal_mem_pool(eal_all_mem_pool_ret_t *eal_mem_pools) if (NULL == mempool_list) return -1; - rte_rwlock_read_lock(RTE_EAL_MEMPOOL_RWLOCK); + rte_mcfg_mempool_read_lock(); eal_mem_pools->mempool_num = 0; TAILQ_FOREACH(te, mempool_list, next) { mp = (struct rte_mempool *) te->data; @@ -186,7 +187,7 @@ static int dp_vs_get_eal_mem_pool(eal_all_mem_pool_ret_t *eal_mem_pools) mempool_ret->trailer_size = mp->trailer_size; mempool_ret->private_data_size = mp->private_data_size; } - rte_rwlock_read_unlock(RTE_EAL_MEMPOOL_RWLOCK); + rte_mcfg_mempool_read_unlock(); return 0; } @@ -201,7 +202,7 @@ static int dp_vs_get_eal_mem_ring(eal_all_mem_ring_ret_t *eal_mem_rings) ring_list = RTE_TAILQ_LOOKUP("RTE_RING", rte_ring_list); - rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK); + rte_mcfg_tailq_read_lock(); eal_mem_rings->ring_num = 0; TAILQ_FOREACH(te, ring_list, next) { r = (struct rte_ring *)te->data; @@ -217,7 +218,7 @@ static int dp_vs_get_eal_mem_ring(eal_all_mem_ring_ret_t *eal_mem_rings) ring_ret->used = rte_ring_count(r); ring_ret->avail = rte_ring_free_count(r); } - rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK); + rte_mcfg_tailq_read_unlock(); return 0; } diff --git a/src/icmp.c b/src/icmp.c index 240311fc8..81d7598d4 100644 --- a/src/icmp.c +++ b/src/icmp.c @@ -39,7 +39,7 @@ struct icmp_ctrl { #ifdef CONFIG_DPVS_ICMP_DEBUG static void icmp_dump_hdr(const struct rte_mbuf *mbuf) { - struct icmp_hdr *ich = rte_pktmbuf_mtod(mbuf, struct icmp_hdr *); + struct rte_icmp_hdr *ich = rte_pktmbuf_mtod(mbuf, struct rte_icmp_hdr *); lcoreid_t lcore = rte_lcore_id(); fprintf(stderr, "lcore %d port %d icmp type %u code %u id %u seq %u\n", @@ -52,12 +52,12 @@ static void icmp_dump_hdr(const struct rte_mbuf *mbuf) static int icmp_echo(struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = mbuf->userdata; - struct icmp_hdr *ich = rte_pktmbuf_mtod(mbuf, struct icmp_hdr *); + struct rte_ipv4_hdr *iph = MBUF_USERDATA(mbuf, struct rte_ipv4_hdr *, MBUF_FIELD_PROTO); + struct rte_icmp_hdr *ich = rte_pktmbuf_mtod(mbuf, struct rte_icmp_hdr *); uint16_t csum; struct flow4 fl4; - if (ich->icmp_type != IP_ICMP_ECHO_REQUEST || ich->icmp_code != 0) { + if (ich->icmp_type != RTE_IP_ICMP_ECHO_REQUEST || ich->icmp_code != 0) { RTE_LOG(WARNING, ICMP, "%s: not echo-request\n", __func__); goto errout; } @@ -79,7 +79,7 @@ static int icmp_echo(struct rte_mbuf *mbuf) goto errout; } - ich->icmp_type = IP_ICMP_ECHO_REPLY; + ich->icmp_type = RTE_IP_ICMP_ECHO_REPLY; /* recalc the checksum */ ich->icmp_cksum = 0; csum = rte_raw_cksum(ich, mbuf->pkt_len); @@ -164,8 +164,8 @@ static struct icmp_ctrl icmp_ctrls[MAX_ICMP_CTRL] = { /* @imbuf is input (original) IP packet to trigger ICMP. */ void icmp_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) { - struct route_entry *rt = imbuf->userdata; - struct ipv4_hdr *iph = ip4_hdr(imbuf); + struct route_entry *rt = MBUF_USERDATA(imbuf, struct route_entry *, MBUF_FIELD_ROUTE); + struct rte_ipv4_hdr *iph = ip4_hdr(imbuf); eth_type_t etype = imbuf->packet_type; /* FIXME: use other field ? 
*/ struct in_addr saddr; uint8_t tos; @@ -196,7 +196,7 @@ void icmp_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) } /* reply only first fragment. */ - if (iph->fragment_offset & htons(IPV4_HDR_OFFSET_MASK)) + if (iph->fragment_offset & htons(RTE_IPV4_HDR_OFFSET_MASK)) return; if (type > NR_ICMP_TYPES) @@ -249,7 +249,7 @@ void icmp_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) RTE_LOG(DEBUG, ICMP, "%s: no memory.\n", __func__); return; } - mbuf->userdata = NULL; + mbuf_userdata_reset(mbuf); assert(rte_pktmbuf_headroom(mbuf) >= 128); /* for L2/L3 */ /* prepare ICMP message */ @@ -265,7 +265,7 @@ void icmp_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) /* copy as much as we can without exceeding 576 (min-MTU) */ room = fl4.fl4_oif->mtu > 576 ? 576 : fl4.fl4_oif->mtu; - room -= sizeof(struct ipv4_hdr); + room -= sizeof(struct rte_ipv4_hdr); room -= sizeof(struct icmphdr); /* we support only linear mbuf now, use m.data_len @@ -291,13 +291,13 @@ void icmp_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) static int icmp_rcv(struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = mbuf->userdata; - struct icmp_hdr *ich; + struct rte_ipv4_hdr *iph = MBUF_USERDATA(mbuf, struct rte_ipv4_hdr *, MBUF_FIELD_PROTO); + struct rte_icmp_hdr *ich; struct icmp_ctrl *ctrl; - if (mbuf_may_pull(mbuf, sizeof(struct icmp_hdr)) != 0) + if (mbuf_may_pull(mbuf, sizeof(struct rte_icmp_hdr)) != 0) goto invpkt; - ich = rte_pktmbuf_mtod(mbuf, struct icmp_hdr *); + ich = rte_pktmbuf_mtod(mbuf, struct rte_icmp_hdr *); if (unlikely(!iph)) { RTE_LOG(WARNING, ICMP, "%s: no ipv4 header\n", __func__); @@ -395,7 +395,7 @@ void icmp_redirect_proc(void *args) /* Remove ether_hdr at the beginning of an mbuf */ data_off = mbuf->data_off; - if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr)))) { + if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct rte_ether_hdr)))) { rte_pktmbuf_free(mbuf); return; } diff --git a/src/iftraf.c b/src/iftraf.c index d0a8c829c..a03277402 100644 --- a/src/iftraf.c +++ b/src/iftraf.c @@ -675,7 +675,7 @@ static int iftraf_pkt_deliver(int af, struct rte_mbuf *mbuf, struct netif_port * portid_t devid; if (af == AF_INET) { - struct ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf); if (unlikely(ip4h->next_proto_id != IPPROTO_TCP && ip4h->next_proto_id != IPPROTO_UDP)) { diff --git a/src/inetaddr.c b/src/inetaddr.c index 9e32c99fb..13081e4e4 100644 --- a/src/inetaddr.c +++ b/src/inetaddr.c @@ -179,7 +179,7 @@ static int ifa_add_del_mcast(struct inet_ifaddr *ifa, bool add) { int err; union inet_addr iaddr; - struct ether_addr eaddr; + struct rte_ether_addr eaddr; /* for ipv6 only */ if (ifa->af != AF_INET6) @@ -219,7 +219,7 @@ int idev_add_mcast_init(void *args) int err; struct inet_device *idev; union inet_addr all_nodes, all_routers; - struct ether_addr eaddr_nodes, eaddr_routers; + struct rte_ether_addr eaddr_nodes, eaddr_routers; struct netif_port *dev = (struct netif_port *) args; @@ -600,7 +600,7 @@ static int ifa_add_route6(struct inet_ifaddr *ifa) static int ifa_add_route(struct inet_ifaddr *ifa) { /* set route from master */ - if (unlikely(rte_lcore_id() != rte_get_master_lcore())) + if (unlikely(rte_lcore_id() != rte_get_main_lcore())) return EDPVS_OK; switch (ifa->af) { @@ -666,7 +666,7 @@ static int ifa_del_route6(struct inet_ifaddr *ifa) static int ifa_del_route(struct inet_ifaddr *ifa) { /* set route from master */ - if (unlikely(rte_lcore_id() != rte_get_master_lcore())) + if 
(unlikely(rte_lcore_id() != rte_get_main_lcore())) return EDPVS_OK; switch (ifa->af) { @@ -686,7 +686,7 @@ static int inet_ifaddr_dad_completed(void *arg) struct inet_ifaddr *ifa = arg; /* only master's ifa scheduled ifa->dad_timer */ - assert(rte_lcore_id() == rte_get_master_lcore()); + assert(rte_lcore_id() == rte_get_main_lcore()); dpvs_timer_cancel_nolock(&ifa->dad_timer, true); ifa->flags &= ~(IFA_F_TENTATIVE | IFA_F_OPTIMISTIC | IFA_F_DADFAILED); @@ -719,7 +719,7 @@ static void inet_ifaddr_dad_start(struct inet_ifaddr *ifa) ifa->flags |= IFA_F_TENTATIVE | IFA_F_OPTIMISTIC; /* timing and sending dad on master only */ - if (cid != rte_get_master_lcore()) + if (cid != rte_get_main_lcore()) return; dpvs_time_rand_delay(&tv, 1000000); @@ -765,7 +765,7 @@ static int ifa_expire(void *arg) struct inet_ifaddr *ifa = (struct inet_ifaddr *)arg; /* only master's ifa scheduled ifa->timer */ - assert(cid == rte_get_master_lcore()); + assert(cid == rte_get_main_lcore()); err = inet_addr_del(ifa->af, ifa->idev->dev, &ifa->addr, ifa->plen); if (err != EDPVS_OK) { @@ -783,7 +783,7 @@ static int ifa_entry_add(const struct ifaddr_action *param) struct inet_device *idev; struct inet_ifaddr *ifa; struct timeval timeo = { 0 }; - bool is_master = (rte_lcore_id() == rte_get_master_lcore()); + bool is_master = (rte_lcore_id() == rte_get_main_lcore()); if (!param || !param->dev || !ifa_prefix_check(param->af, ¶m->addr, param->plen)) @@ -900,7 +900,7 @@ static int ifa_entry_mod(const struct ifaddr_action *param) struct inet_device *idev; struct inet_ifaddr *ifa; struct timeval timeo = { 0 }; - bool is_master = (rte_lcore_id() == rte_get_master_lcore()); + bool is_master = (rte_lcore_id() == rte_get_main_lcore()); if (!param || !param->dev || !ifa_prefix_check(param->af, ¶m->addr, param->plen)) @@ -1021,7 +1021,7 @@ static int ifa_entry_sync(const struct ifaddr_action *param) /* only support snyc flags now */ ifa->flags = param->flags; if ((ifa->flags & IFA_F_DADFAILED) && - (rte_lcore_id() == rte_get_master_lcore())) + (rte_lcore_id() == rte_get_main_lcore())) dpvs_timer_cancel(&ifa->dad_timer, true); ifa_put(ifa); @@ -1041,7 +1041,7 @@ static void ifa_free(struct inet_ifaddr **ifa_p) /* remove @ifa from @ifa_expired_list */ list_del_init(&ifa->h_list); - if (cid == rte_get_master_lcore()) { + if (cid == rte_get_main_lcore()) { /* it's safe to cancel timer not pending but zeroed */ dpvs_timer_cancel(&ifa->dad_timer, true); dpvs_timer_cancel(&ifa->timer, true); @@ -1105,7 +1105,7 @@ static void fill_ifaddr_entry(lcoreid_t cid, const struct inet_ifaddr *ifa, stru entry->ifa_entry.prefered_lft = 0; } else { struct timeval now, diff; - dpvs_time_now(&now, rte_lcore_id() == rte_get_master_lcore()); + dpvs_time_now(&now, rte_lcore_id() == rte_get_main_lcore()); timersub(&now, &ifa->tstemp, &diff); entry->ifa_entry.valid_lft = ifa->valid_lft - diff.tv_sec; entry->ifa_entry.prefered_lft = ifa->prefered_lft - diff.tv_sec; @@ -1227,7 +1227,7 @@ static int ifa_msg_sync_cb(struct dpvs_msg *msg) struct ifaddr_action *param; /* sync from master lcore only */ - assert(rte_lcore_id() == rte_get_master_lcore()); + assert(rte_lcore_id() == rte_get_main_lcore()); if (!msg || msg->len != sizeof(*param)) return EDPVS_INVAL; @@ -1459,7 +1459,7 @@ static int inet_addr_sync(const struct ifaddr_action *param) struct dpvs_msg *msg; cid = rte_lcore_id(); - mid = rte_get_master_lcore(); + mid = rte_get_main_lcore(); /* call from master */ if (cid == mid) @@ -1473,7 +1473,7 @@ static int inet_addr_sync(const struct ifaddr_action *param) 
return EDPVS_NOMEM; } - err = msg_send(msg, rte_get_master_lcore(), DPVS_MSG_F_ASYNC, NULL); + err = msg_send(msg, rte_get_main_lcore(), DPVS_MSG_F_ASYNC, NULL); if (err != EDPVS_OK) RTE_LOG(WARNING, IFA, "[%02d] %s: msg_send failed\n", cid, __func__); @@ -1488,7 +1488,7 @@ static int ifaddr_get_basic(struct inet_device *idev, struct inet_addr_data_arra /* convey ifa data on master lcore */ cid = rte_lcore_id(); - assert(cid == rte_get_master_lcore()); + assert(cid == rte_get_main_lcore()); if (idev) ifa_cnt = idev->ifa_cnt[cid]; @@ -1785,7 +1785,7 @@ static struct dpvs_msg_type ifa_msg_types[] = { .type = MSG_TYPE_IFA_SYNC, .prio = MSG_PRIO_NORM, .mode = DPVS_MSG_UNICAST, - //.cid = rte_get_master_lcore(), + //.cid = rte_get_main_lcore(), .unicast_msg_cb = ifa_msg_sync_cb, .multicast_msg_cb = NULL } @@ -1815,7 +1815,7 @@ int inet_addr_init(void) INIT_LIST_HEAD(&ifa_expired_list[cid]); } - ifa_msg_types[2].cid = rte_get_master_lcore(); + ifa_msg_types[2].cid = rte_get_main_lcore(); if ((err = sockopt_register(&ifa_sockopts)) != EDPVS_OK) { RTE_LOG(ERR, IFA, "%s: fail to register ifa_sockopts -- %s\n", diff --git a/src/ip_gre.c b/src/ip_gre.c index e8412b635..a0219846c 100644 --- a/src/ip_gre.c +++ b/src/ip_gre.c @@ -271,7 +271,7 @@ static int gre_rcv(struct rte_mbuf *mbuf) if (hlen < 0) goto drop; - iph = mbuf->userdata; /* see ipv4_local_in_fin */ + iph = MBUF_USERDATA(mbuf, struct iphdr *, MBUF_FIELD_PROTO); /* see ipv4_local_in_fin */ assert(iph->version == 4 && iph->protocol == IPPROTO_GRE); tnl = ip_tunnel_lookup(&gre_tunnel_tab, mbuf->port, tpi.flags, diff --git a/src/ip_tunnel.c b/src/ip_tunnel.c index e7d85f3a4..62acbcd6c 100644 --- a/src/ip_tunnel.c +++ b/src/ip_tunnel.c @@ -197,7 +197,7 @@ static struct netif_port *tunnel_create(struct ip_tunnel_tab *tab, set before tunnel_bind_dev */ if (tnl->link) { dev->flag |= tnl->link->flag; - ether_addr_copy(&tnl->link->addr, &dev->addr); + rte_ether_addr_copy(&tnl->link->addr, &dev->addr); } dev->flag |= NETIF_PORT_FLAG_RUNNING; /* XXX */ dev->flag |= NETIF_PORT_FLAG_NO_ARP; @@ -333,7 +333,7 @@ static int tunnel_update_pmtu(struct netif_port *dev, struct rte_mbuf *mbuf, else mtu = rt->mtu ? : dev->mtu; - if (mbuf->packet_type == ETHER_TYPE_IPv4) { + if (mbuf->packet_type == RTE_ETHER_TYPE_IPV4) { if ((iiph->frag_off & htons(IP_DF)) && mtu < pkt_size) { icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return EDPVS_FRAG; @@ -362,7 +362,7 @@ static int tunnel_xmit(struct rte_mbuf *mbuf, __be32 src, __be32 dst, oiph->daddr = dst; oiph->saddr = src; oiph->ttl = ttl; - oiph->id = ip4_select_id((struct ipv4_hdr *)oiph); + oiph->id = ip4_select_id((struct rte_ipv4_hdr *)oiph); return ipv4_local_out(mbuf); } @@ -805,7 +805,7 @@ int ip_tunnel_xmit(struct rte_mbuf *mbuf, struct netif_port *dev, assert(mbuf && dev && tiph); - if (mbuf->packet_type == ETHER_TYPE_IPv4) + if (mbuf->packet_type == RTE_ETHER_TYPE_IPV4) iiph = rte_pktmbuf_mtod_offset(mbuf, struct iphdr *, tnl->hlen); connected = tiph->daddr != 0; @@ -852,7 +852,7 @@ int ip_tunnel_xmit(struct rte_mbuf *mbuf, struct netif_port *dev, /* refer route in mbuf and this reference will be put later. 
*/ route4_get(rt); - mbuf->userdata = (void *)rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; err = tunnel_update_pmtu(dev, mbuf, rt, tiph->frag_off, iiph); if (err != EDPVS_OK) diff --git a/src/ipip.c b/src/ipip.c index d8b978b35..2b4e0dcbd 100644 --- a/src/ipip.c +++ b/src/ipip.c @@ -72,7 +72,7 @@ static int ipip_rcv(struct rte_mbuf *mbuf) /* IPv4's upper layer can use @userdata for IP header, * see ipv4_local_in_fin() */ - iph = mbuf->userdata; + iph = MBUF_USERDATA(mbuf, struct iphdr *, MBUF_FIELD_PROTO); assert(iph->version == 4 && iph->protocol == IPPROTO_IPIP); tnl = ip_tunnel_lookup(&ipip_tunnel_tab, mbuf->port, TUNNEL_F_NO_KEY, diff --git a/src/ipset.c b/src/ipset.c index c94624cc6..762d6a185 100644 --- a/src/ipset.c +++ b/src/ipset.c @@ -509,8 +509,8 @@ int ipset_init(void) for (i = 0; i < IPSET_TAB_SIZE; i++) INIT_LIST_HEAD(&this_ipset_table_lcore[i]); - rte_eal_mp_remote_launch(ipset_lcore_init, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(ipset_lcore_init, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, IPSET, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); @@ -542,8 +542,8 @@ int ipset_term(void) if ((err = sockopt_unregister(&ipset_sockopts)) != EDPVS_OK) return err; - rte_eal_mp_remote_launch(ipset_flush_lcore, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(ipset_flush_lcore, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, IPSET, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); diff --git a/src/ipv4.c b/src/ipv4.c index e3fb35849..652a351c3 100644 --- a/src/ipv4.c +++ b/src/ipv4.c @@ -107,7 +107,7 @@ static void ip4_show_hdr(const char *func, const struct rte_mbuf *mbuf) { portid_t port; lcoreid_t lcore; - struct ipv4_hdr *iph; + struct rte_ipv4_hdr *iph; char saddr[16], daddr[16]; port = mbuf->port; @@ -122,7 +122,7 @@ static void ip4_show_hdr(const char *func, const struct rte_mbuf *mbuf) RTE_LOG(DEBUG, IPV4, "%s: [%d] port %u ipv4 hl %u tos %u tot %u " "id %u ttl %u prot %u src %s dst %s\n", - func, lcore, port, IPV4_HDR_IHL_MASK & iph->version_ihl, + func, lcore, port, RTE_IPV4_HDR_IHL_MASK & iph->version_ihl, iph->type_of_service, ntohs(iph->total_length), ntohs(iph->packet_id), iph->time_to_live, iph->next_proto_id, saddr, daddr); @@ -155,8 +155,8 @@ static int ipv4_local_in_fin(struct rte_mbuf *mbuf) { int err, hlen; const struct inet_protocol *prot; - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct route_entry *rt = mbuf->userdata; + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); + struct route_entry *rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); int (*handler)(struct rte_mbuf *mbuf) = NULL; /* remove network header */ @@ -165,7 +165,7 @@ static int ipv4_local_in_fin(struct rte_mbuf *mbuf) if (rt) { route4_put(rt); - mbuf->userdata = NULL; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = NULL; } /* @@ -174,13 +174,11 @@ static int ipv4_local_in_fin(struct rte_mbuf *mbuf) * but mbuf do not. Consider the length of header is variable * (e.g., IPv4 options), it's not make sence for every layer * to parse lower layer's headers. - * note if mbuf->userdata is not suitable, we can use 'extened' - * mbuf to save offsets like skb. * * BTW, if netif_port_get() called too many times we can also * use 'extend' mbuf to save 'netif_port *dev'. 
*/ - mbuf->userdata = iph; + MBUF_USERDATA(mbuf, struct rte_ipv4_hdr *, MBUF_FIELD_PROTO) = iph; /* deliver to upper layer */ rte_spinlock_lock(&inet_prot_lock); @@ -203,8 +201,9 @@ static int ipv4_local_in_fin(struct rte_mbuf *mbuf) static int ipv4_local_in(struct rte_mbuf *mbuf) { int err; - struct route_entry *rt = mbuf->userdata; + struct route_entry *rt; + rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); if (ip4_is_frag(ip4_hdr(mbuf))) { if ((err = ip4_defrag(mbuf, IP_DEFRAG_LOCAL_IN)) != EDPVS_OK) { route4_put(rt); @@ -218,10 +217,11 @@ static int ipv4_local_in(struct rte_mbuf *mbuf) static int ipv4_output_fin2(struct rte_mbuf *mbuf) { - struct route_entry *rt = mbuf->userdata; + struct route_entry *rt; int err; struct in_addr nexthop; + rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); if (rt->gw.s_addr == htonl(INADDR_ANY)) nexthop.s_addr = ip4_hdr(mbuf)->dst_addr; else @@ -236,7 +236,7 @@ static int ipv4_output_fin2(struct rte_mbuf *mbuf) * note it was used in RX path for eth_type_t. * really confusing. */ - mbuf->packet_type = ETHER_TYPE_IPv4; + mbuf->packet_type = RTE_ETHER_TYPE_IPV4; mbuf->l3_len = ip4_hdrlen(mbuf); err = neigh_output(AF_INET, (union inet_addr *)&nexthop, mbuf, rt->port); @@ -246,7 +246,7 @@ static int ipv4_output_fin2(struct rte_mbuf *mbuf) static int ipv4_output_fin(struct rte_mbuf *mbuf) { - struct route_entry *rt = mbuf->userdata; + struct route_entry *rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); if (mbuf->pkt_len > rt->mtu) return ipv4_fragment(mbuf, rt->mtu, ipv4_output_fin2); @@ -256,7 +256,7 @@ static int ipv4_output_fin(struct rte_mbuf *mbuf) int ipv4_output(struct rte_mbuf *mbuf) { - struct route_entry *rt = mbuf->userdata; + struct route_entry *rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); assert(rt); IP4_UPD_PO_STATS(out, mbuf->pkt_len); @@ -277,8 +277,8 @@ static int ipv4_forward_fin(struct rte_mbuf *mbuf) static int ipv4_forward(struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct route_entry *rt = mbuf->userdata; + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); + struct route_entry *rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); uint32_t mtu, csum; assert(rt && rt->port); @@ -291,7 +291,7 @@ static int ipv4_forward(struct rte_mbuf *mbuf) mtu = rt->mtu; if (mbuf->pkt_len > mtu - && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + && (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { IP4_INC_STATS(fragfails); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); goto drop; @@ -327,7 +327,7 @@ int ipv4_rcv_fin(struct rte_mbuf *mbuf) { int err; struct route_entry *rt = NULL; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */ /* input route decision */ @@ -346,7 +346,7 @@ int ipv4_rcv_fin(struct rte_mbuf *mbuf) } /* use extended mbuf if have more data then @rt */ - mbuf->userdata = (void *)rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; if (rt->flag & RTF_LOCALIN) { return ipv4_local_in(mbuf); @@ -378,7 +378,7 @@ static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port) #ifdef CONFIG_ICMP_REDIRECT_CORE struct icmphdr *ich, _icmph; #endif - struct ipv4_hdr *iph; + struct rte_ipv4_hdr *iph; uint16_t hlen, len; eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? 
*/ assert(mbuf); @@ -390,13 +390,13 @@ static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port) IP4_UPD_PO_STATS(in, mbuf->pkt_len); iftraf_pkt_in(AF_INET, mbuf, port); - if (mbuf_may_pull(mbuf, sizeof(struct ipv4_hdr)) != 0) + if (mbuf_may_pull(mbuf, sizeof(struct rte_ipv4_hdr)) != 0) goto inhdr_error; iph = ip4_hdr(mbuf); hlen = ip4_hdrlen(mbuf); - if (((iph->version_ihl) >> 4) != 4 || hlen < sizeof(struct ipv4_hdr)) + if (((iph->version_ihl) >> 4) != 4 || hlen < sizeof(struct rte_ipv4_hdr)) goto inhdr_error; if (mbuf_may_pull(mbuf, hlen) != 0) @@ -421,7 +421,7 @@ static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port) goto drop; } } - mbuf->userdata = NULL; + mbuf_userdata_reset(mbuf); mbuf->l3_len = hlen; #ifdef CONFIG_DPVS_IP_HEADER_DEBUG @@ -436,7 +436,7 @@ static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port) if (unlikely(!ich)) goto drop; if (ich->type == ICMP_ECHOREPLY || ich->type == ICMP_ECHO) { - rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr)); + rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct rte_ether_hdr)); icmp_recv_proc(mbuf); return EDPVS_OK; } @@ -456,7 +456,7 @@ static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port) } static struct pkt_type ip4_pkt_type = { - //.type = rte_cpu_to_be_16(ETHER_TYPE_IPv4), + //.type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4), .func = ipv4_rcv, .port = NULL, }; @@ -486,7 +486,7 @@ int ipv4_init(void) if ((err = ipv4_frag_init()) != EDPVS_OK) return err; - ip4_pkt_type.type = htons(ETHER_TYPE_IPv4); + ip4_pkt_type.type = htons(RTE_ETHER_TYPE_IPV4); if ((err = netif_register_pkt(&ip4_pkt_type)) != EDPVS_OK) { ipv4_frag_term(); return err; @@ -509,7 +509,7 @@ int ipv4_term(void) return EDPVS_OK; } -uint32_t ip4_select_id(struct ipv4_hdr *iph) +uint32_t ip4_select_id(struct rte_ipv4_hdr *iph) { uint32_t hash, id; rte_atomic32_t *p_id; @@ -526,8 +526,9 @@ uint32_t ip4_select_id(struct ipv4_hdr *iph) int ipv4_local_out(struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct route_entry *rt = mbuf->userdata; + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); + struct route_entry *rt = MBUF_USERDATA(mbuf, + struct route_entry *, MBUF_FIELD_ROUTE); iph->total_length = htons(mbuf->pkt_len); @@ -543,7 +544,7 @@ int ipv4_local_out(struct rte_mbuf *mbuf) int ipv4_xmit(struct rte_mbuf *mbuf, const struct flow4 *fl4) { struct route_entry *rt; - struct ipv4_hdr *iph; + struct rte_ipv4_hdr *iph; if (!mbuf || !fl4) { if (mbuf) @@ -559,9 +560,9 @@ int ipv4_xmit(struct rte_mbuf *mbuf, const struct flow4 *fl4) IP4_INC_STATS(outnoroutes); return EDPVS_NOROUTE; } - mbuf->userdata = (void *)rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = (void *)rt; - iph = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(struct ipv4_hdr)); + iph = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(struct rte_ipv4_hdr)); if (!iph) { rte_pktmbuf_free(mbuf); route4_put(rt); diff --git a/src/ipv4_frag.c b/src/ipv4_frag.c index d08a7fa88..ce9f06078 100644 --- a/src/ipv4_frag.c +++ b/src/ipv4_frag.c @@ -172,7 +172,7 @@ static struct ipv4_frag ip4_frags[DPVS_MAX_LCORE]; int ipv4_reassamble(struct rte_mbuf *mbuf) { struct rte_mbuf *asm_mbuf, *next, *seg, *prev; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); assert(mbuf->l3_len > 0); @@ -199,7 +199,7 @@ int ipv4_reassamble(struct rte_mbuf *mbuf) rte_pktmbuf_free(asm_mbuf); return EDPVS_NOMEM; } - seg->userdata = NULL; + mbuf_userdata_reset(seg); for (prev = asm_mbuf; prev; prev = prev->next) if 
(prev->next == mbuf)
 			break;
@@ -259,15 +259,16 @@ int ipv4_fragment(struct rte_mbuf *mbuf, unsigned int mtu,
 		  int (*output)(struct rte_mbuf *))
 {
-	struct ipv4_hdr *iph = ip4_hdr(mbuf);
-	struct route_entry *rt = mbuf->userdata;
+	struct rte_ipv4_hdr *iph = ip4_hdr(mbuf);
+	struct route_entry *rt = MBUF_USERDATA(mbuf,
+			struct route_entry *, MBUF_FIELD_ROUTE);
 	struct rte_mbuf *frag;
 	unsigned int left, len, hlen;
 	int offset, err, from;
 	void *to;
 
 	assert(rt);
-	if (iph->fragment_offset & IPV4_HDR_DF_FLAG) {
+	if (iph->fragment_offset & RTE_IPV4_HDR_DF_FLAG) {
 		icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			  htonl(mtu));
 		err = EDPVS_FRAG;
@@ -295,11 +296,12 @@ int ipv4_fragment(struct rte_mbuf *mbuf, unsigned int mtu,
 			err = EDPVS_NOMEM;
 			goto out;
 		}
-		frag->userdata = NULL;
+		mbuf_userdata_reset(frag);
 
 		/* copy metadata from orig pkt */
 		route4_get(rt);
-		frag->userdata = rt; /* no need to hold before consume mbuf */
+		/* no need to hold before consume mbuf */
+		MBUF_USERDATA(frag, struct route_entry *, MBUF_FIELD_ROUTE) = rt;
 		frag->port = mbuf->port;
 		frag->ol_flags = 0; /* do not offload csum for frag */
 		frag->l2_len = mbuf->l2_len;
@@ -330,7 +332,7 @@ int ipv4_fragment(struct rte_mbuf *mbuf, unsigned int mtu,
 		/* TODO: if (offset == 0) ip_fragment_options(frag); */
 
 		if (left > 0)
-			iph->fragment_offset |= htons(IPV4_HDR_MF_FLAG);
+			iph->fragment_offset |= htons(RTE_IPV4_HDR_MF_FLAG);
 
 		offset += len;
 		from += len;
diff --git a/src/ipv6/icmp6.c b/src/ipv6/icmp6.c
index da894a2a5..20e4a2c4d 100644
--- a/src/ipv6/icmp6.c
+++ b/src/ipv6/icmp6.c
@@ -56,7 +56,7 @@ uint16_t icmp6_csum(struct ip6_hdr *iph, struct icmp6_hdr *ich)
 	hdr.ip6_dst = iph->ip6_dst;
 
 	csum = rte_raw_cksum(ich, l4_len);
-	csum += rte_ipv6_phdr_cksum((struct ipv6_hdr *)&hdr, 0);
+	csum += rte_ipv6_phdr_cksum((struct rte_ipv6_hdr *)&hdr, 0);
 
 	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
 	csum = (~csum) & 0xffff;
@@ -75,7 +75,7 @@ void icmp6_send_csum(struct ip6_hdr *shdr, struct icmp6_hdr *ich)
 	l4_len = ntohs(shdr->ip6_plen);
 
 	csum = rte_raw_cksum(ich, l4_len);
-	csum += rte_ipv6_phdr_cksum((struct ipv6_hdr *)shdr, 0);
+	csum += rte_ipv6_phdr_cksum((struct rte_ipv6_hdr *)shdr, 0);
 
 	csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
 	csum = (~csum) & 0xffff;
@@ -221,7 +221,7 @@ void icmp6_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info)
 		RTE_LOG(DEBUG, ICMP6, "%s: no memory.\n", __func__);
 		return;
 	}
-	mbuf->userdata = NULL;
+	mbuf_userdata_reset(mbuf);
 	assert(rte_pktmbuf_headroom(mbuf) >= 128); /* for L2/L3 */
 	ich = (struct icmp6_hdr*)rte_pktmbuf_append(mbuf, sizeof(struct icmp6_hdr));;
 	if (!ich) {
@@ -298,7 +298,7 @@ static int icmp6_echo_reply(struct rte_mbuf *mbuf, struct ip6_hdr *iph,
 
 static int icmp6_rcv(struct rte_mbuf *mbuf)
 {
-	struct ip6_hdr *iph = mbuf->userdata;
+	struct ip6_hdr *iph = MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO);
 	struct icmp6_hdr *ich;
 
 	assert(iph);
diff --git a/src/ipv6/ipv6.c b/src/ipv6/ipv6.c
index b9323363c..1fa712110 100644
--- a/src/ipv6/ipv6.c
+++ b/src/ipv6/ipv6.c
@@ -139,7 +139,7 @@ static void ip6_conf_disable(vector_t tokens)
 	else
 		RTE_LOG(WARNING, IPV6, "invalid ipv6:disable %s\n", str);
 
-	RTE_LOG(INFO, IPV6, "ipv6:disable = %s", conf_ipv6_disable ? "on" : "off");
+	RTE_LOG(INFO, IPV6, "ipv6:disable = %s\n", conf_ipv6_disable ? "on" : "off");
 
 	FREE_PTR(str);
 }
@@ -159,14 +159,14 @@ static int ip6_local_in_fin(struct rte_mbuf *mbuf)
 	 * and set it to IPv6 fixed header for upper layer.
*/ if (!ipv6_addr_is_multicast(&hdr->ip6_dst)) { - struct route6 *rt = mbuf->userdata; + struct route6 *rt = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); if (rt) { route6_put(rt); - mbuf->userdata = NULL; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = NULL; } } - mbuf->userdata = (void *)hdr; + MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO) = hdr; nexthdr = hdr->ip6_nxt; /* parse extension headers */ @@ -292,7 +292,7 @@ static inline unsigned int ip6_mtu_forward(struct route6 *rt) static int ip6_fragment(struct rte_mbuf *mbuf, uint32_t mtu, int (*out)(struct rte_mbuf *)) { - struct route6 *rt = mbuf->userdata; + struct route6 *rt = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); /* TODO: */ @@ -319,16 +319,16 @@ static int ip6_output_fin2(struct rte_mbuf *mbuf) return EDPVS_INVAL; } - dev = mbuf->userdata; + dev = MBUF_USERDATA(mbuf, struct netif_port *, MBUF_FIELD_ROUTE); /* only support linklocal! */ nexthop = &hdr->ip6_dst; } else { - rt = mbuf->userdata; + rt = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); dev = rt->rt6_dev; nexthop = ip6_rt_nexthop(rt, &hdr->ip6_dst); } - mbuf->packet_type = ETHER_TYPE_IPv6; + mbuf->packet_type = RTE_ETHER_TYPE_IPV6; err = neigh_output(AF_INET6, (union inet_addr *)nexthop, mbuf, dev); @@ -344,9 +344,9 @@ static int ip6_output_fin(struct rte_mbuf *mbuf) struct ip6_hdr *hdr = ip6_hdr(mbuf); if (ipv6_addr_is_multicast(&hdr->ip6_dst)) - mtu = ((struct netif_port *)mbuf->userdata)->mtu; + mtu = MBUF_USERDATA(mbuf, struct netif_port *, MBUF_FIELD_ROUTE)->mtu; else - mtu = ((struct route6 *)mbuf->userdata)->rt6_mtu; + mtu = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)->rt6_mtu; if (mbuf->pkt_len > mtu) return ip6_fragment(mbuf, mtu, ip6_output_fin2); @@ -361,9 +361,9 @@ int ip6_output(struct rte_mbuf *mbuf) struct ip6_hdr *hdr = ip6_hdr(mbuf); if (ipv6_addr_is_multicast(&hdr->ip6_dst)) { - dev = mbuf->userdata; + dev = MBUF_USERDATA(mbuf, struct netif_port *, MBUF_FIELD_ROUTE); } else { - rt = mbuf->userdata; + rt = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); dev = rt->rt6_dev; } @@ -389,12 +389,11 @@ int ip6_local_out(struct rte_mbuf *mbuf) struct ip6_hdr *hdr = ip6_hdr(mbuf); if (ipv6_addr_is_multicast(&hdr->ip6_dst)) - dev = mbuf->userdata; + dev = MBUF_USERDATA(mbuf, struct netif_port *, MBUF_FIELD_ROUTE); else - dev = ((struct route6 *)mbuf->userdata)->rt6_dev; + dev = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)->rt6_dev; - return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, NULL, - dev, ip6_output); + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, NULL, dev, ip6_output); } static int ip6_forward_fin(struct rte_mbuf *mbuf) @@ -408,7 +407,7 @@ static int ip6_forward_fin(struct rte_mbuf *mbuf) static int ip6_forward(struct rte_mbuf *mbuf) { struct ip6_hdr *hdr = ip6_hdr(mbuf); - struct route6 *rt = mbuf->userdata; + struct route6 *rt = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); int addrtype; uint32_t mtu; @@ -502,7 +501,7 @@ static int ip6_rcv_fin(struct rte_mbuf *mbuf) * someday, we may use extended mbuf if have more L3 info * then route need to be saved into mbuf. 
*/ - mbuf->userdata = (void *)rt; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt; if (rt->rt6_flags & RTF_LOCALIN) { return ip6_local_in(mbuf); @@ -521,7 +520,7 @@ static int ip6_rcv_fin(struct rte_mbuf *mbuf) kni: if (rt) { route6_put(rt); - mbuf->userdata = NULL; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = NULL; } return EDPVS_KNICONTINUE; } @@ -613,7 +612,7 @@ static int ip6_rcv(struct rte_mbuf *mbuf, struct netif_port *dev) * @userdata is used to save route info in L3. */ mbuf->l3_len = sizeof(*hdr); - mbuf->userdata = NULL; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = NULL; /* hop-by-hop option header */ if (hdr->ip6_nxt == NEXTHDR_HOP) { @@ -655,7 +654,7 @@ int ipv6_init(void) return err; /* htons, cpu_to_be16 not work when struct initialization :( */ - ip6_pkt_type.type = htons(ETHER_TYPE_IPv6); + ip6_pkt_type.type = htons(RTE_ETHER_TYPE_IPV6); err = netif_register_pkt(&ip6_pkt_type); if (err) @@ -720,7 +719,8 @@ int ipv6_xmit(struct rte_mbuf *mbuf, struct flow6 *fl6) return EDPVS_NOTSUPP; } assert(fl6->fl6_oif); - mbuf->userdata = (void *)fl6->fl6_oif; + /* use mbuf userdata type MBUF_FIELD_ROUTE for saving spaces */ + MBUF_USERDATA(mbuf, struct netif_port *, MBUF_FIELD_ROUTE) = fl6->fl6_oif; dev = fl6->fl6_oif; } else { @@ -731,7 +731,7 @@ int ipv6_xmit(struct rte_mbuf *mbuf, struct flow6 *fl6) rte_pktmbuf_free(mbuf); return EDPVS_NOROUTE; } - mbuf->userdata = (void *)rt; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt; dev = rt->rt6_dev; } @@ -868,7 +868,7 @@ uint16_t ip6_phdr_cksum(struct ip6_hdr *ip6h, uint64_t ol_flags, } /*FIXME: what if NEXTHDR_ROUTING is not the first exthdr? */ - csum = rte_ipv6_phdr_cksum((struct ipv6_hdr *)ip6h, ol_flags); + csum = rte_ipv6_phdr_cksum((struct rte_ipv6_hdr *)ip6h, ol_flags); /* restore original ip6h header */ ip6h->ip6_nxt = ip6nxt; @@ -905,7 +905,7 @@ uint16_t ip6_udptcp_cksum(struct ip6_hdr *ip6h, const void *l4_hdr, } /*FIXME: what if NEXTHDR_ROUTING is not the first exthdr? 
*/ - csum = rte_ipv6_udptcp_cksum((struct ipv6_hdr *)ip6h, l4_hdr); + csum = rte_ipv6_udptcp_cksum((struct rte_ipv6_hdr *)ip6h, l4_hdr); /* restore original ip6h header */ ip6h->ip6_nxt = ip6nxt; diff --git a/src/ipv6/ipv6_exthdrs.c b/src/ipv6/ipv6_exthdrs.c index ac33d3231..6f35cc3f2 100644 --- a/src/ipv6/ipv6_exthdrs.c +++ b/src/ipv6/ipv6_exthdrs.c @@ -93,7 +93,7 @@ int ip6_skip_exthdr(const struct rte_mbuf *imbuf, int start, __u8 *nexthdrp) */ static int ip6_dummy_hdr_rcv(struct rte_mbuf *mbuf) { - struct ip6_hdr *hdr = mbuf->userdata; + struct ip6_hdr *hdr = MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO); struct ip6_ext *exthdr; if (mbuf_may_pull(mbuf, 8) != 0) diff --git a/src/ipv6/ndisc.c b/src/ipv6/ndisc.c index d5b6b4a35..41eff7e71 100644 --- a/src/ipv6/ndisc.c +++ b/src/ipv6/ndisc.c @@ -197,7 +197,7 @@ static struct rte_mbuf *ndisc_build_mbuf(struct netif_port *dev, { struct rte_mbuf *mbuf; struct icmp6_hdr *icmp6hdr; - struct ipv6_hdr iph; + struct rte_ipv6_hdr iph; int len; uint8_t *opt; @@ -211,7 +211,7 @@ static struct rte_mbuf *ndisc_build_mbuf(struct netif_port *dev, RTE_LOG(ERR, NEIGHBOUR, "mbuf_pool alloc failed\n"); return NULL; } - mbuf->userdata = NULL; + mbuf_userdata_reset(mbuf); icmp6hdr = (struct icmp6_hdr *)rte_pktmbuf_append(mbuf, sizeof(*icmp6h)); rte_memcpy(icmp6hdr, icmp6h, sizeof(*icmp6h)); @@ -356,8 +356,8 @@ static int ndisc_recv_ns(struct rte_mbuf *mbuf, struct netif_port *dev) int hashkey = 0; uint32_t ndoptlen = 0; - struct in6_addr *saddr = &((struct ip6_hdr *)mbuf->userdata)->ip6_src; - struct in6_addr *daddr = &((struct ip6_hdr *)mbuf->userdata)->ip6_dst; + struct in6_addr *saddr = &MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO)->ip6_src; + struct in6_addr *daddr = &MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO)->ip6_dst; struct nd_msg *msg = rte_pktmbuf_mtod(mbuf, struct nd_msg *); int dad = ipv6_addr_any(saddr); @@ -440,12 +440,12 @@ static int ndisc_recv_ns(struct rte_mbuf *mbuf, struct netif_port *dev) hashkey = neigh_hashkey(AF_INET6, (union inet_addr *)saddr, dev); neigh = neigh_lookup_entry(AF_INET6, (union inet_addr *)saddr, dev, hashkey); if (neigh && !(neigh->flag & NEIGHBOUR_STATIC)) { - neigh_edit(neigh, (struct ether_addr *)lladdr); + neigh_edit(neigh, (struct rte_ether_addr *)lladdr); neigh_entry_state_trans(neigh, 1); neigh_sync_core(neigh, 1, NEIGH_ENTRY); } else { neigh = neigh_add_table(AF_INET6, (union inet_addr *)saddr, - (struct ether_addr *)lladdr, dev, hashkey, 0); + (struct rte_ether_addr *)lladdr, dev, hashkey, 0); if (!neigh){ RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__); return EDPVS_NOMEM; @@ -468,12 +468,12 @@ static int ndisc_recv_na(struct rte_mbuf *mbuf, struct netif_port *dev) struct neighbour_entry *neigh; struct inet_ifaddr *ifa; int hashkey; - struct in6_addr *daddr = &((struct ip6_hdr *)mbuf->userdata)->ip6_dst; + struct in6_addr *daddr = &MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO)->ip6_dst; struct nd_msg *msg = rte_pktmbuf_mtod(mbuf, struct nd_msg *); uint32_t ndoptlen = mbuf->data_len - offsetof(struct nd_msg, opt); #ifdef CONFIG_NDISC_DEBUG - struct in6_addr *saddr = &((struct ip6_hdr *)mbuf->userdata)->ip6_src; + struct in6_addr *saddr = &MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO)->ip6_src; ndisc_show_addr(__func__, saddr, daddr); #endif @@ -526,12 +526,12 @@ static int ndisc_recv_na(struct rte_mbuf *mbuf, struct netif_port *dev) hashkey = neigh_hashkey(AF_INET6, (union inet_addr *)&msg->target, dev); neigh = neigh_lookup_entry(AF_INET6, 
(union inet_addr *)&msg->target, dev, hashkey); if (neigh && !(neigh->flag & NEIGHBOUR_STATIC)) { - neigh_edit(neigh, (struct ether_addr *)lladdr); + neigh_edit(neigh, (struct rte_ether_addr *)lladdr); neigh_entry_state_trans(neigh, 1); neigh_sync_core(neigh, 1, NEIGH_ENTRY); } else { neigh = neigh_add_table(AF_INET6, (union inet_addr *)&msg->target, - (struct ether_addr *)lladdr, dev, hashkey, 0); + (struct rte_ether_addr *)lladdr, dev, hashkey, 0); if (!neigh) { RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__); return EDPVS_NOMEM; @@ -548,7 +548,7 @@ int ndisc_rcv(struct rte_mbuf *mbuf, struct netif_port *dev) { struct nd_msg *msg; int ret; - struct ip6_hdr *ipv6_hdr = mbuf->userdata; + struct ip6_hdr *ipv6_hdr = MBUF_USERDATA(mbuf, struct ip6_hdr *, MBUF_FIELD_PROTO); if (mbuf_may_pull(mbuf, sizeof(struct icmp6_hdr)) != 0) { ret = EDPVS_NOMEM; diff --git a/src/ipv6/route6.c b/src/ipv6/route6.c index a5a7a1f11..d0ac2461e 100644 --- a/src/ipv6/route6.c +++ b/src/ipv6/route6.c @@ -137,7 +137,7 @@ static int rt6_setup_lcore(void *arg) tv.tv_sec = g_rt6_recycle_time, tv.tv_usec = 0, - global = (rte_lcore_id() == rte_get_master_lcore()); + global = (rte_lcore_id() == rte_get_main_lcore()); INIT_LIST_HEAD(&this_rt6_dustbin.routes); err = dpvs_timer_sched_period(&this_rt6_dustbin.tm, &tv, rt6_recycle, NULL, global); @@ -210,7 +210,7 @@ static int rt6_add_del(const struct dp_vs_route6_conf *cf) lcoreid_t cid; cid = rte_lcore_id(); - assert(cid == rte_get_master_lcore()); + assert(cid == rte_get_main_lcore()); /* for master */ switch (cf->ops) { @@ -412,8 +412,8 @@ int route6_init(void) return EDPVS_NOTEXIST; } - rte_eal_mp_remote_launch(rt6_setup_lcore, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(rt6_setup_lcore, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(ERR, RT6, "%s: fail to setup rt6 on lcore%d -- %s\n", __func__, cid, dpvs_strerror(err)); @@ -462,8 +462,8 @@ int route6_term(void) if (err != EDPVS_OK) RTE_LOG(WARNING, RT6, "%s:fail to unregister route6 msg!\n", __func__); - rte_eal_mp_remote_launch(rt6_destroy_lcore, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(rt6_destroy_lcore, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, RT6, "%s: fail to destroy rt6 on lcore%d -- %s\n", __func__, cid, dpvs_strerror(err)); diff --git a/src/ipv6/route6_lpm.c b/src/ipv6/route6_lpm.c index c146d224f..7c0ce488b 100644 --- a/src/ipv6/route6_lpm.c +++ b/src/ipv6/route6_lpm.c @@ -118,7 +118,7 @@ static int rt6_lpm_setup_lcore(void *arg) .flags = 0, }; - if ((!(g_lcore_mask & (1<userdata = NULL; + MBUF_USERDATA(cloned_syn_mbuf, void *, MBUF_FIELD_ROUTE) = NULL; conn->packet_xmit(pp, conn, cloned_syn_mbuf); } } @@ -1476,7 +1476,7 @@ static int sockopt_conn_get_all(const struct ip_vs_conn_req *conn_req, } if ((conn_req->flag & GET_IPVS_CONN_FLAG_TEMPLATE) - && (cid == rte_get_master_lcore())) { /* persist conns */ + && (cid == rte_get_main_lcore())) { /* persist conns */ rte_spinlock_lock(&dp_vs_ct_lock); res = __lcore_conn_table_dump(dp_vs_ct_tbl); rte_spinlock_unlock(&dp_vs_ct_lock); @@ -1787,8 +1787,8 @@ int dp_vs_conn_init(void) * RTE_PER_LCORE() can only access own instances. * it make codes looks strange. 
*/ - rte_eal_mp_remote_launch(conn_init_lcore, NULL, SKIP_MASTER); - RTE_LCORE_FOREACH_SLAVE(lcore) { + rte_eal_mp_remote_launch(conn_init_lcore, NULL, SKIP_MAIN); + RTE_LCORE_FOREACH_WORKER(lcore) { if ((err = rte_eal_wait_lcore(lcore)) < 0) { RTE_LOG(WARNING, IPVS, "%s: lcore %d: %s.\n", __func__, lcore, dpvs_strerror(err)); @@ -1827,8 +1827,8 @@ int dp_vs_conn_term(void) /* no API opposite to rte_mempool_create() */ - rte_eal_mp_remote_launch(conn_term_lcore, NULL, SKIP_MASTER); - RTE_LCORE_FOREACH_SLAVE(lcore) { + rte_eal_mp_remote_launch(conn_term_lcore, NULL, SKIP_MAIN); + RTE_LCORE_FOREACH_WORKER(lcore) { rte_eal_wait_lcore(lcore); } diff --git a/src/ipvs/ip_vs_core.c b/src/ipvs/ip_vs_core.c index 4a75a8de9..63b09ab9a 100644 --- a/src/ipvs/ip_vs_core.c +++ b/src/ipvs/ip_vs_core.c @@ -43,7 +43,7 @@ static inline int dp_vs_fill_iphdr(int af, struct rte_mbuf *mbuf, struct dp_vs_iphdr *iph) { if (af == AF_INET) { - struct ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf); iph->af = AF_INET; iph->len = ip4_hdrlen(mbuf); iph->proto = ip4h->next_proto_id; @@ -425,7 +425,7 @@ static int __xmit_outbound_icmp4(struct rte_mbuf *mbuf, { struct flow4 fl4; struct route_entry *rt = NULL; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); /* no translation needed for DR/TUN. */ if (conn->dest->fwdmode != DPVS_FWD_MODE_FNAT && @@ -451,7 +451,7 @@ static int __xmit_outbound_icmp4(struct rte_mbuf *mbuf, } if ((mbuf->pkt_len > rt->mtu) - && (ip4_hdr(mbuf)->fragment_offset & IPV4_HDR_DF_FLAG)) { + && (ip4_hdr(mbuf)->fragment_offset & RTE_IPV4_HDR_DF_FLAG)) { route4_put(rt); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); @@ -459,9 +459,9 @@ static int __xmit_outbound_icmp4(struct rte_mbuf *mbuf, return EDPVS_FRAG; } - if (unlikely(mbuf->userdata != NULL)) - route4_put((struct route_entry *)mbuf->userdata); - mbuf->userdata = rt; + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; /* translation for outer L3, ICMP, and inner L3 and L4 */ dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_OUTBOUND); @@ -507,9 +507,9 @@ static int __xmit_outbound_icmp6(struct rte_mbuf *mbuf, return EDPVS_FRAG; } - if (unlikely(mbuf->userdata != NULL)) - route6_put((struct route6 *)mbuf->userdata); - mbuf->userdata = rt6; + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* translation for outer L3, ICMP, and inner L3 and L4 */ dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_OUTBOUND); @@ -538,7 +538,7 @@ static int __xmit_inbound_icmp4(struct rte_mbuf *mbuf, { struct flow4 fl4; struct route_entry *rt = NULL; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); /* no translation needed for DR/TUN. 
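The other rename visible here is DPDK 20.11's lcore terminology: `rte_get_master_lcore()` becomes `rte_get_main_lcore()`, `CALL_MASTER`/`SKIP_MASTER` become `CALL_MAIN`/`SKIP_MAIN`, and `RTE_LCORE_FOREACH_SLAVE` becomes `RTE_LCORE_FOREACH_WORKER`. The launch-and-join idiom the patch rewrites in module after module looks like this (a compilable sketch; `setup_lcore` stands in for `conn_init_lcore`, `rt6_setup_lcore` and friends):

```c
#include <rte_launch.h>
#include <rte_lcore.h>

/* stand-in for conn_init_lcore, rt6_setup_lcore, whtlst_lcore_init, ... */
static int setup_lcore(void *arg)
{
    (void)arg;
    /* per-lcore initialization goes here */
    return 0;
}

static int setup_on_all_lcores(void)
{
    unsigned int cid;
    int err;

    /* CALL_MAIN also runs the function on the main lcore;
     * SKIP_MAIN restricts it to the workers. */
    rte_eal_mp_remote_launch(setup_lcore, NULL, CALL_MAIN);

    RTE_LCORE_FOREACH_WORKER(cid) {
        err = rte_eal_wait_lcore(cid);  /* join, collect return value */
        if (err < 0)
            return err;
    }
    return 0;
}
```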
*/ if (conn->dest->fwdmode != DPVS_FWD_MODE_NAT && @@ -564,7 +564,7 @@ static int __xmit_inbound_icmp4(struct rte_mbuf *mbuf, } if ((mbuf->pkt_len > rt->mtu) - && (ip4_hdr(mbuf)->fragment_offset & IPV4_HDR_DF_FLAG)) { + && (ip4_hdr(mbuf)->fragment_offset & RTE_IPV4_HDR_DF_FLAG)) { route4_put(rt); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); @@ -572,9 +572,9 @@ static int __xmit_inbound_icmp4(struct rte_mbuf *mbuf, return EDPVS_FRAG; } - if (unlikely(mbuf->userdata != NULL)) - route4_put((struct route_entry *)mbuf->userdata); - mbuf->userdata = rt; + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; /* translation for outer L3, ICMP, and inner L3 and L4 */ dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_INBOUND); @@ -621,9 +621,9 @@ static int __xmit_inbound_icmp6(struct rte_mbuf *mbuf, return EDPVS_FRAG; } - if (unlikely(mbuf->userdata != NULL)) - route6_put((struct route6 *)mbuf->userdata); - mbuf->userdata = rt6; + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* translation for outer L3, ICMP, and inner L3 and L4 */ dp_vs_xmit_icmp(mbuf, prot, conn, DPVS_CONN_DIR_INBOUND); @@ -650,8 +650,8 @@ static int xmit_inbound_icmp(struct rte_mbuf *mbuf, static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) { struct icmphdr *ich, _icmph; - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct ipv4_hdr *ciph, _ciph; + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *ciph, _ciph; struct dp_vs_iphdr dciph; struct dp_vs_proto *prot; struct dp_vs_conn *conn; @@ -696,7 +696,7 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) if (!prot) return INET_ACCEPT; - if (unlikely((ciph->fragment_offset & htons(IPV4_HDR_OFFSET_MASK)))) { + if (unlikely((ciph->fragment_offset & htons(RTE_IPV4_HDR_OFFSET_MASK)))) { RTE_LOG(WARNING, IPVS, "%s: frag needed.\n", __func__); return INET_DROP; } @@ -707,7 +707,7 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) * and restore it later. although it looks strange. 
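These ICMP paths also show the recurring route-caching idiom: the freshly looked-up route is parked in the mbuf's `MBUF_FIELD_ROUTE` slot, after releasing any stale reference already sitting there so route refcounts cannot leak. The idiom wrapped in a hypothetical helper (`route4_put()` and `struct route_entry` are DPVS's own):

```c
/* Hypothetical helper wrapping the recurring "release stale route, store
 * the new one" sequence; route4_put()/route_entry come from DPVS. */
static inline void mbuf_cache_route4(struct rte_mbuf *mbuf,
                                     struct route_entry *rt)
{
    struct route_entry *old;

    old = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE);
    if (unlikely(old != NULL))
        route4_put(old);        /* drop the stale reference */

    MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt;
}
```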
*/ rte_pktmbuf_adj(mbuf, off); - if (mbuf_may_pull(mbuf, sizeof(struct ipv4_hdr)) != 0) + if (mbuf_may_pull(mbuf, sizeof(struct rte_ipv4_hdr)) != 0) return INET_DROP; dp_vs_fill_iphdr(AF_INET, mbuf, &dciph); @@ -719,7 +719,7 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) */ if (cid != peer_cid) { /* recover mbuf.data_off to outer Ether header */ - rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr) + off); + rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct rte_ether_hdr) + off); return dp_vs_redirect_pkt(mbuf, peer_cid); } @@ -862,7 +862,7 @@ static int __dp_vs_in_icmp6(struct rte_mbuf *mbuf, int *related) */ if (cid != peer_cid) { /* recover mbuf.data_off to outer Ether header */ - rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr) + off); + rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct rte_ether_hdr) + off); return dp_vs_redirect_pkt(mbuf, peer_cid); } @@ -995,7 +995,7 @@ static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf, */ if (cid != peer_cid) { /* recover mbuf.data_off to outer Ether header */ - rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr)); + rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct rte_ether_hdr)); return dp_vs_redirect_pkt(mbuf, peer_cid); } diff --git a/src/ipvs/ip_vs_dest.c b/src/ipvs/ip_vs_dest.c index e75a3c3b7..98d24be59 100644 --- a/src/ipvs/ip_vs_dest.c +++ b/src/ipvs/ip_vs_dest.c @@ -63,7 +63,7 @@ static void __dp_vs_dest_update(struct dp_vs_service *svc, if (udest->max_conn == 0 || udest->max_conn > dest->max_conn) dest->flags &= ~DPVS_DEST_F_OVERLOAD; - if (rte_lcore_id() != rte_get_master_lcore()) { + if (rte_lcore_id() != rte_get_main_lcore()) { dest->max_conn = udest->max_conn / num_lcores; dest->min_conn = udest->min_conn / num_lcores; } else { diff --git a/src/ipvs/ip_vs_laddr.c b/src/ipvs/ip_vs_laddr.c index 543bce257..9d4cad725 100644 --- a/src/ipvs/ip_vs_laddr.c +++ b/src/ipvs/ip_vs_laddr.c @@ -439,7 +439,7 @@ static int laddr_sockopt_set(sockoptid_t opt, const void *conf, size_t size) lcoreid_t cid = rte_lcore_id(); // send to slave core - if (cid == rte_get_master_lcore()) { + if (cid == rte_get_main_lcore()) { struct dpvs_msg *msg; msg = msg_make(set_opt_so2msg(opt), laddr_msg_seq(), DPVS_MSG_MULTICAST, cid, size, conf); @@ -608,7 +608,7 @@ static int laddr_sockopt_get(sockoptid_t opt, const void *conf, size_t size, return EDPVS_MSG_FAIL; } - if (cid == rte_get_master_lcore()) { + if (cid == rte_get_main_lcore()) { if (dp_vs_match_parse(laddr_conf->srange, laddr_conf->drange, laddr_conf->iifname, laddr_conf->oifname, laddr_conf->af_s, &match) != EDPVS_OK) { diff --git a/src/ipvs/ip_vs_nat64.c b/src/ipvs/ip_vs_nat64.c index 7ea4d637e..e9a827b60 100644 --- a/src/ipvs/ip_vs_nat64.c +++ b/src/ipvs/ip_vs_nat64.c @@ -25,7 +25,7 @@ int mbuf_6to4(struct rte_mbuf *mbuf, const struct in_addr *daddr) { struct ip6_hdr *ip6h = ip6_hdr(mbuf); - struct ipv4_hdr *ip4h; + struct rte_ipv4_hdr *ip4h; uint8_t next_prot; uint8_t ttl; @@ -43,14 +43,14 @@ int mbuf_6to4(struct rte_mbuf *mbuf, next_prot = ip6h->ip6_nxt; ttl = ip6h->ip6_hlim; - ip4h = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(struct ipv4_hdr)); + ip4h = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(struct rte_ipv4_hdr)); if (!ip4h) return EDPVS_NOROOM; ip4h->version_ihl = ((4 << 4) | 5); ip4h->type_of_service = 0; ip4h->total_length = htons(mbuf->pkt_len); - ip4h->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ip4h->fragment_offset = htons(RTE_IPV4_HDR_DF_FLAG); ip4h->time_to_live = ttl; ip4h->next_proto_id = next_prot; 
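`mbuf_6to4()` here (its tail continues just below), like `tcp_send_rst()` further down, constructs an IPv4 header from scratch with `rte_pktmbuf_prepend()`; only the `rte_ipv4_hdr` and `RTE_IPV4_HDR_DF_FLAG` spellings are new. The construction condensed into one hedged sketch, with source, destination and L4 protocol as parameters for illustration:

```c
#include <netinet/in.h>
#include <rte_ip.h>
#include <rte_mbuf.h>

/* Sketch: prepend a minimal 20-byte IPv4 header with DF set and TTL 64,
 * mirroring the values the surrounding hunks use. */
static struct rte_ipv4_hdr *prepend_ip4_hdr(struct rte_mbuf *mbuf,
                                            rte_be32_t saddr, rte_be32_t daddr,
                                            uint8_t proto)
{
    struct rte_ipv4_hdr *ip4h;

    ip4h = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(mbuf,
                sizeof(struct rte_ipv4_hdr));
    if (ip4h == NULL)
        return NULL;                            /* no headroom left */

    ip4h->version_ihl     = 0x45;               /* IPv4, ihl = 5 words */
    ip4h->type_of_service = 0;
    ip4h->total_length    = htons(mbuf->pkt_len);
    ip4h->packet_id       = 0;
    ip4h->fragment_offset = htons(RTE_IPV4_HDR_DF_FLAG);
    ip4h->time_to_live    = 64;
    ip4h->next_proto_id   = proto;
    ip4h->hdr_checksum    = 0;                  /* filled in later (SW or HW) */
    ip4h->src_addr        = saddr;
    ip4h->dst_addr        = daddr;

    mbuf->l3_len = sizeof(struct rte_ipv4_hdr);
    return ip4h;
}
```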
ip4h->hdr_checksum = 0; @@ -58,7 +58,7 @@ int mbuf_6to4(struct rte_mbuf *mbuf, ip4h->dst_addr = daddr->s_addr; ip4h->packet_id = 0; // NO FRAG, so 0 is OK? - mbuf->l3_len = sizeof(struct ipv4_hdr); + mbuf->l3_len = sizeof(struct rte_ipv4_hdr); return EDPVS_OK; } @@ -67,13 +67,13 @@ int mbuf_4to6(struct rte_mbuf *mbuf, const struct in6_addr *saddr, const struct in6_addr *daddr) { - struct ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf); struct ip6_hdr *ip6h; uint16_t plen; uint8_t hops; uint8_t next_prot; - if (mbuf->l3_len != sizeof(struct ipv4_hdr)) { + if (mbuf->l3_len != sizeof(struct rte_ipv4_hdr)) { return EDPVS_NOTSUPP; } if (rte_pktmbuf_adj(mbuf, mbuf->l3_len) == NULL) diff --git a/src/ipvs/ip_vs_proto_tcp.c b/src/ipvs/ip_vs_proto_tcp.c index 361d435b0..863746c67 100644 --- a/src/ipvs/ip_vs_proto_tcp.c +++ b/src/ipvs/ip_vs_proto_tcp.c @@ -139,7 +139,7 @@ inline struct tcphdr *tcp_hdr(const struct rte_mbuf *mbuf) * @th: pointer to the beginning of the L4 header * @return void */ -inline void tcp4_send_csum(struct ipv4_hdr *iph, struct tcphdr *th) +inline void tcp4_send_csum(struct rte_ipv4_hdr *iph, struct tcphdr *th) { th->check = 0; th->check = rte_ipv4_udptcp_cksum(iph, th); @@ -151,7 +151,7 @@ inline void tcp4_send_csum(struct ipv4_hdr *iph, struct tcphdr *th) * @th: pointer to the beginning of the L4 header * @return void */ -inline void tcp6_send_csum(struct ipv6_hdr *iph, struct tcphdr *th) { +inline void tcp6_send_csum(struct rte_ipv6_hdr *iph, struct tcphdr *th) { th->check = 0; th->check = ip6_udptcp_cksum((struct ip6_hdr *)iph, th, (void *)th - (void *)iph, IPPROTO_TCP); @@ -165,7 +165,7 @@ static inline int tcp_send_csum(int af, int iphdrlen, struct tcphdr *th, struct netif_port *dev = NULL; if (AF_INET6 == af) { - struct route6 *rt6 = mbuf->userdata; + struct route6 *rt6 = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); struct ip6_hdr *ip6h = ip6_hdr(mbuf); if (rt6 && rt6->rt6_dev) dev = rt6->rt6_dev; @@ -179,11 +179,11 @@ static inline int tcp_send_csum(int af, int iphdrlen, struct tcphdr *th, } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - tcp6_send_csum((struct ipv6_hdr *)ip6h, th); + tcp6_send_csum((struct rte_ipv6_hdr *)ip6h, th); } } else { /* AF_INET */ - struct route_entry *rt = mbuf->userdata; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct route_entry *rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); if (rt && rt->port) dev = rt->port; else if (conn->out_dev) @@ -319,9 +319,11 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, * check if we can add the new option */ /* skb length and tcp option length checking */ - if (tuplehash_out(conn).af == AF_INET && (rt = mbuf->userdata) != NULL) { + if (tuplehash_out(conn).af == AF_INET && (rt = MBUF_USERDATA(mbuf, + struct route_entry *, MBUF_FIELD_ROUTE)) != NULL) { mtu = rt->mtu; - } else if (tuplehash_out(conn).af == AF_INET6 && (rt6 = mbuf->userdata) != NULL) { + } else if (tuplehash_out(conn).af == AF_INET6 && (rt6 = MBUF_USERDATA(mbuf, + struct route6 *, MBUF_FIELD_ROUTE)) != NULL) { mtu = rt6->rt6_mtu; } else if (conn->in_dev) { /* no route for fast-xmit */ mtu = conn->in_dev->mtu; @@ -990,7 +992,7 @@ static int tcp_send_rst(struct dp_vs_proto *proto, struct rte_mempool *pool; struct rte_mbuf *mbuf = NULL; struct tcphdr *th; - struct ipv4_hdr *ip4h; + struct rte_ipv4_hdr *ip4h; struct ip6_hdr *ip6h; if (conn->state != DPVS_TCP_S_ESTABLISHED) { @@ -1005,7 
+1007,7 @@ static int tcp_send_rst(struct dp_vs_proto *proto, mbuf = rte_pktmbuf_alloc(pool); if (!mbuf) return EDPVS_NOMEM; - mbuf->userdata = NULL; /* make sure "no route info" */ + mbuf_userdata_reset(mbuf); /* make sure "no route info" */ /* * reserve head room ? @@ -1041,8 +1043,8 @@ static int tcp_send_rst(struct dp_vs_proto *proto, /* IP header (before translation) */ if (dir == DPVS_CONN_DIR_INBOUND) { if (tuplehash_in(conn).af == AF_INET) { - ip4h = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, - sizeof(struct ipv4_hdr)); + ip4h = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(mbuf, + sizeof(struct rte_ipv4_hdr)); if (!ip4h) { rte_pktmbuf_free(mbuf); return EDPVS_NOROOM; @@ -1050,7 +1052,7 @@ static int tcp_send_rst(struct dp_vs_proto *proto, ip4h->version_ihl = 0x45; ip4h->total_length = htons(mbuf->pkt_len); ip4h->packet_id = 0; - ip4h->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ip4h->fragment_offset = htons(RTE_IPV4_HDR_DF_FLAG); ip4h->time_to_live = 64; ip4h->next_proto_id = IPPROTO_TCP; ip4h->src_addr = conn->caddr.in.s_addr; @@ -1079,15 +1081,15 @@ static int tcp_send_rst(struct dp_vs_proto *proto, mbuf->l3_len = sizeof(*ip6h); - tcp6_send_csum((struct ipv6_hdr *)ip6h, th); + tcp6_send_csum((struct rte_ipv6_hdr *)ip6h, th); } conn->packet_xmit(proto, conn, mbuf); } else { if (tuplehash_out(conn).af == AF_INET) { - ip4h = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, - sizeof(struct ipv4_hdr)); + ip4h = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(mbuf, + sizeof(struct rte_ipv4_hdr)); if (!ip4h) { rte_pktmbuf_free(mbuf); return EDPVS_NOROOM; @@ -1095,7 +1097,7 @@ static int tcp_send_rst(struct dp_vs_proto *proto, ip4h->version_ihl = 0x45; ip4h->total_length = htons(mbuf->pkt_len); ip4h->packet_id = 0; - ip4h->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ip4h->fragment_offset = htons(RTE_IPV4_HDR_DF_FLAG); ip4h->time_to_live = 64; ip4h->next_proto_id = IPPROTO_TCP; ip4h->src_addr = conn->daddr.in.s_addr; @@ -1124,7 +1126,7 @@ static int tcp_send_rst(struct dp_vs_proto *proto, mbuf->l3_len = sizeof(*ip6h); - tcp6_send_csum((struct ipv6_hdr *)ip6h, th); + tcp6_send_csum((struct rte_ipv6_hdr *)ip6h, th); } conn->packet_out_xmit(proto, conn, mbuf); diff --git a/src/ipvs/ip_vs_proto_udp.c b/src/ipvs/ip_vs_proto_udp.c index e098d2774..b868d47e9 100644 --- a/src/ipvs/ip_vs_proto_udp.c +++ b/src/ipvs/ip_vs_proto_udp.c @@ -63,20 +63,20 @@ static int udp_timeouts[DPVS_UDP_S_LAST + 1] = { [DPVS_UDP_S_LAST] = 2, }; -inline void udp4_send_csum(struct ipv4_hdr *iph, struct udp_hdr *uh) +inline void udp4_send_csum(struct rte_ipv4_hdr *iph, struct rte_udp_hdr *uh) { uh->dgram_cksum = 0; uh->dgram_cksum = rte_ipv4_udptcp_cksum(iph, uh); } -inline void udp6_send_csum(struct ipv6_hdr *iph, struct udp_hdr *uh) +inline void udp6_send_csum(struct rte_ipv6_hdr *iph, struct rte_udp_hdr *uh) { uh->dgram_cksum = 0; uh->dgram_cksum = ip6_udptcp_cksum((struct ip6_hdr *)iph, (struct udphdr *)uh, (void *)uh - (void *)iph, IPPROTO_UDP); } -static inline int udp_send_csum(int af, int iphdrlen, struct udp_hdr *uh, +static inline int udp_send_csum(int af, int iphdrlen, struct rte_udp_hdr *uh, const struct dp_vs_conn *conn, struct rte_mbuf *mbuf, const struct opphdr *opp) { @@ -88,28 +88,28 @@ static inline int udp_send_csum(int af, int iphdrlen, struct udp_hdr *uh, /* UDP checksum is mandatory for IPv6.[RFC 2460] */ struct ip6_hdr *ip6h = ip6_hdr(mbuf); if (unlikely(opp != NULL)) { - udp6_send_csum((struct ipv6_hdr*)ip6h, uh); + udp6_send_csum((struct rte_ipv6_hdr*)ip6h, uh); } else { - struct route6 *rt6 = 
mbuf->userdata; + struct route6 *rt6 = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); if (rt6 && rt6->rt6_dev) dev = rt6->rt6_dev; else if (conn->out_dev) dev = conn->out_dev; if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD))) { mbuf->l3_len = iphdrlen; - mbuf->l4_len = sizeof(struct udp_hdr); + mbuf->l4_len = sizeof(struct rte_udp_hdr); mbuf->ol_flags |= (PKT_TX_UDP_CKSUM | PKT_TX_IPV6); uh->dgram_cksum = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_UDP); } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - udp6_send_csum((struct ipv6_hdr*)ip6h, uh); + udp6_send_csum((struct rte_ipv6_hdr*)ip6h, uh); } } } else { /* AF_INET */ /* UDP checksum is not mandatory for IPv4. */ - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); if (unlikely(opp != NULL)) { /* * XXX: UDP pseudo header need UDP length, but the common helper function @@ -123,14 +123,14 @@ static inline int udp_send_csum(int af, int iphdrlen, struct udp_hdr *uh, */ uh->dgram_cksum = 0; } else { - struct route_entry *rt = mbuf->userdata; + struct route_entry *rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); if (rt && rt->port) dev = rt->port; else if (conn->out_dev) dev = conn->out_dev; if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD))) { mbuf->l3_len = iphdrlen; - mbuf->l4_len = sizeof(struct udp_hdr); + mbuf->l4_len = sizeof(struct rte_udp_hdr); mbuf->ol_flags |= (PKT_TX_UDP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); uh->dgram_cksum = rte_ipv4_phdr_cksum(iph, mbuf->ol_flags); } else { @@ -149,7 +149,7 @@ static int udp_conn_sched(struct dp_vs_proto *proto, struct dp_vs_conn **conn, int *verdict) { - struct udp_hdr *uh, _udph; + struct rte_udp_hdr *uh, _udph; struct dp_vs_service *svc; bool outwall = false; assert(proto && iph && mbuf && conn && verdict); @@ -199,7 +199,7 @@ udp_conn_lookup(struct dp_vs_proto *proto, struct rte_mbuf *mbuf, int *direct, bool reverse, bool *drop, lcoreid_t *peer_cid) { - struct udp_hdr *uh, _udph; + struct rte_udp_hdr *uh, _udph; struct dp_vs_conn *conn; assert(proto && iph && mbuf); @@ -285,7 +285,8 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, int iaf = tuplehash_in(conn).af; int oaf = tuplehash_out(conn).af; - assert(conn && ombuf && oiph && ouh && ombuf->userdata); + assert(conn && ombuf && oiph && ouh && + MBUF_USERDATA_CONST(ombuf, void *, MBUF_FIELD_ROUTE)); /* just in case */ if (unlikely(conn->dest->fwdmode != DPVS_FWD_MODE_FNAT)) @@ -294,7 +295,7 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, mbuf = rte_pktmbuf_alloc(ombuf->pool); if (unlikely(!mbuf)) return EDPVS_NOMEM; - mbuf->userdata = NULL; + MBUF_USERDATA(mbuf, void *, MBUF_FIELD_ROUTE) = NULL; int ipolen_uoa = (AF_INET6 == iaf) ? IPOLEN_UOA_IPV6 : IPOLEN_UOA_IPV4; @@ -315,7 +316,7 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, goto no_room; ((struct iphdr *)iph)->version = 4; ((struct iphdr *)iph)->tos = ((struct iphdr *)oiph)->tos; - ((struct iphdr *)iph)->id = ip4_select_id((struct ipv4_hdr *)iph); + ((struct iphdr *)iph)->id = ip4_select_id((struct rte_ipv4_hdr *)iph); ((struct iphdr *)iph)->frag_off = 0; ((struct iphdr *)iph)->ttl = ((struct iphdr *)oiph)->ttl; ((struct iphdr *)iph)->saddr = conn->laddr.in.s_addr; @@ -397,14 +398,16 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, * if udp checksum error here, may cause tcpdump & uoa moudule parse packets * correctly, however socket can not receive L4 data. 
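The `udp_send_csum()` hunks above apply one rule to both address families: if the TX device advertises UDP checksum offload, seed `dgram_cksum` with the pseudo-header sum and let the NIC finish the job; otherwise compute the full checksum in software (mandatory for IPv6, optional for IPv4). The IPv4 side in isolation, reusing the `NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD` flag seen above (a sketch, not the exact DPVS function):

```c
/* Sketch of the hardware-offload vs software split for IPv4/UDP; the real
 * code also ensures the mbuf is contiguous (mbuf_may_pull) before the
 * software fallback. */
static void udp4_csum_sketch(struct netif_port *dev, struct rte_mbuf *mbuf,
                             struct rte_ipv4_hdr *iph, struct rte_udp_hdr *uh,
                             int iphdrlen)
{
    if (dev && (dev->flag & NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD)) {
        mbuf->l3_len = iphdrlen;
        mbuf->l4_len = sizeof(struct rte_udp_hdr);
        mbuf->ol_flags |= (PKT_TX_UDP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4);
        /* seed with the pseudo-header sum; the NIC completes it */
        uh->dgram_cksum = rte_ipv4_phdr_cksum(iph, mbuf->ol_flags);
    } else {
        uh->dgram_cksum = 0;
        uh->dgram_cksum = rte_ipv4_udptcp_cksum(iph, uh);
    }
}
```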
*/ - udp6_send_csum((struct ipv6_hdr *)iph, (struct udp_hdr*)uh); - mbuf->userdata = rt6 = (struct route6*)ombuf->userdata; + udp6_send_csum((struct rte_ipv6_hdr *)iph, (struct rte_udp_hdr*)uh); + rt6 = MBUF_USERDATA_CONST(ombuf, struct route6 *, MBUF_FIELD_ROUTE); + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; route6_get(rt6); return ip6_local_out(mbuf); } else { /* IPv4 */ struct route_entry *rt; uh->check = 0; /* rte_ipv4_udptcp_cksum fails if opp inserted. */ - mbuf->userdata = rt = (struct route_entry *)ombuf->userdata; + rt = MBUF_USERDATA_CONST(ombuf, struct route_entry *, MBUF_FIELD_ROUTE); + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; route4_get(rt); return ipv4_local_out(mbuf); } @@ -501,7 +504,7 @@ static int insert_opp_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, * basic header length (40 B) + payload length(including ext header) */ iphdrlen = ip6_hdrlen(mbuf); - if (iphdrlen != sizeof(struct ipv6_hdr)) + if (iphdrlen != sizeof(struct rte_ipv6_hdr)) goto standalone_uoa; iptot_len = sizeof(struct ip6_hdr) + ntohs(((struct ip6_hdr *)iph)->ip6_plen); @@ -618,7 +621,7 @@ static int udp_insert_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, return EDPVS_OK; } - rt = mbuf->userdata; + rt = MBUF_USERDATA(mbuf, void *, MBUF_FIELD_ROUTE); if (!rt) { RTE_LOG(ERR, IPVS, "%s: no route\n", __func__); return EDPVS_INVPKT; @@ -677,7 +680,7 @@ static int udp_fnat_in_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct udp_hdr *uh = NULL; + struct rte_udp_hdr *uh = NULL; struct opphdr *opp = NULL; void *iph = NULL; /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ @@ -699,14 +702,14 @@ static int udp_fnat_in_handler(struct dp_vs_proto *proto, } /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct rte_udp_hdr))) return EDPVS_INVPKT; if (nxt_proto == IPPROTO_UDP) { - uh = (struct udp_hdr *)(iph + iphdrlen); + uh = (struct rte_udp_hdr *)(iph + iphdrlen); } else if (nxt_proto == IPPROTO_OPT) { opp = (struct opphdr *)(iph + iphdrlen); - uh = (struct udp_hdr *)((void *)opp + ntohs(opp->length)); + uh = (struct rte_udp_hdr *)((void *)opp + ntohs(opp->length)); } if (unlikely(!uh)) @@ -722,15 +725,15 @@ static int udp_fnat_out_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct udp_hdr *uh; + struct rte_udp_hdr *uh; /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ int af = tuplehash_in(conn).af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct rte_udp_hdr))) return EDPVS_INVPKT; - uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, iphdrlen); + uh = rte_pktmbuf_mtod_offset(mbuf, struct rte_udp_hdr *, iphdrlen); if (unlikely(!uh)) return EDPVS_INVPKT; @@ -756,14 +759,14 @@ static int udp_snat_in_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct udp_hdr *uh; + struct rte_udp_hdr *uh; int af = conn->af; int iphdrlen = ((AF_INET6 == af) ? 
ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) return EDPVS_INVPKT; - uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, iphdrlen); + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct rte_udp_hdr))) return EDPVS_INVPKT; + uh = rte_pktmbuf_mtod_offset(mbuf, struct rte_udp_hdr *, iphdrlen); if (unlikely(!uh)) return EDPVS_INVPKT; @@ -776,14 +779,14 @@ static int udp_snat_out_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct udp_hdr *uh; + struct rte_udp_hdr *uh; int af = conn->af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); /* cannot use mbuf_header_pointer() */ - if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct udp_hdr))) return EDPVS_INVPKT; - uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, iphdrlen); + if (unlikely(mbuf->data_len < iphdrlen + sizeof(struct rte_udp_hdr))) return EDPVS_INVPKT; + uh = rte_pktmbuf_mtod_offset(mbuf, struct rte_udp_hdr *, iphdrlen); if (unlikely(!uh)) return EDPVS_INVPKT; diff --git a/src/ipvs/ip_vs_rr.c b/src/ipvs/ip_vs_rr.c index 469f841fd..0fa1b6558 100644 --- a/src/ipvs/ip_vs_rr.c +++ b/src/ipvs/ip_vs_rr.c @@ -20,15 +20,15 @@ static int dp_vs_rr_init_svc(struct dp_vs_service *svc) { - svc->sched_data = &svc->dests; + svc->sched_data = dp_vs_sched_first_dest(svc); + return EDPVS_OK; } static int dp_vs_rr_update_svc(struct dp_vs_service *svc, struct dp_vs_dest *dest __rte_unused, sockoptid_t opt __rte_unused) { - svc->sched_data = &svc->dests; - return EDPVS_OK; + return dp_vs_rr_init_svc(svc); } /* @@ -68,7 +68,6 @@ static struct dp_vs_dest *dp_vs_rr_schedule(struct dp_vs_service *svc, static struct dp_vs_scheduler dp_vs_rr_scheduler = { .name = "rr", /* name */ -// .refcnt = ATOMIC_INIT(0), .n_list = LIST_HEAD_INIT(dp_vs_rr_scheduler.n_list), .init_service = dp_vs_rr_init_svc, .update_service = dp_vs_rr_update_svc, diff --git a/src/ipvs/ip_vs_sched.c b/src/ipvs/ip_vs_sched.c index 0cd37d6cc..f84321e78 100644 --- a/src/ipvs/ip_vs_sched.c +++ b/src/ipvs/ip_vs_sched.c @@ -119,6 +119,29 @@ int dp_vs_gcd_weight(struct dp_vs_service *svc) return g ? g : 1; } +/* + * Different workers should start the scheduling algorithm from dests that are evenly distributed + * across the whole dest list. This avoids clustering connections onto a few dests in the + * early phase after service setup, especially for scheduling methods such as rr/wrr/wlc. 
+ */ +struct list_head * dp_vs_sched_first_dest(const struct dp_vs_service *svc) +{ + int i, cid, loc; + struct list_head *ini; + + cid = rte_lcore_id(); + ini = svc->dests.next; + loc = (svc->num_dests / g_slave_lcore_num ?: 1) * g_lcore_index[cid] % (svc->num_dests ?: 1); + + for (i = 0; i < loc; i++) { + ini = ini->next; + if (unlikely(ini == &svc->dests)) + ini = ini->next; + } + + return ini; +} + /* * Lookup scheduler and try to load it if it doesn't exist */ diff --git a/src/ipvs/ip_vs_service.c b/src/ipvs/ip_vs_service.c index 536f4091c..424f0eab5 100644 --- a/src/ipvs/ip_vs_service.c +++ b/src/ipvs/ip_vs_service.c @@ -199,8 +199,8 @@ static inline bool __service_in_range(int af, static struct dp_vs_service * __dp_vs_service_match_get4(const struct rte_mbuf *mbuf, bool *outwall, lcoreid_t cid) { - struct route_entry *rt = mbuf->userdata; - struct ipv4_hdr *iph = ip4_hdr(mbuf); /* ipv4 only */ + struct route_entry *rt = MBUF_USERDATA_CONST(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); /* ipv4 only */ struct dp_vs_service *svc; union inet_addr saddr, daddr; __be16 _ports[2], *ports; @@ -267,7 +267,7 @@ __dp_vs_service_match_get4(const struct rte_mbuf *mbuf, bool *outwall, lcoreid_t static struct dp_vs_service * __dp_vs_service_match_get6(const struct rte_mbuf *mbuf, lcoreid_t cid) { - struct route6 *rt = mbuf->userdata; + struct route6 *rt = MBUF_USERDATA_CONST(mbuf, struct route6 *, MBUF_FIELD_ROUTE); struct ip6_hdr *iph = ip6_hdr(mbuf); uint8_t ip6nxt = iph->ip6_nxt; struct dp_vs_service *svc; @@ -299,7 +299,7 @@ __dp_vs_service_match_get6(const struct rte_mbuf *mbuf, lcoreid_t cid) if (!rt) return NULL; - /* set mbuf->userdata to @rt as side-effect is not good! + /* set mbuf userdata(MBUF_FIELD_ROUTE) to @rt as side-effect is not good! * although route will done again when out-xmit. 
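The new `dp_vs_sched_first_dest()` above offsets each worker's starting dest by `(num_dests / num_workers) * worker_index`, wrapping around the list head. A standalone check of the arithmetic, with 8 dests and 4 workers as assumed example values:

```c
#include <stdio.h>

int main(void)
{
    int num_dests = 8, num_workers = 4;     /* assumed example values */
    int k;

    for (k = 0; k < num_workers; k++) {
        /* same formula as the diff; `?:` (a GNU extension the diff also
         * uses) guards against zero dests or zero workers */
        int loc = (num_dests / num_workers ?: 1) * k % (num_dests ?: 1);
        printf("worker %d starts at dest %d\n", k, loc);
    }
    /* prints 0, 2, 4, 6: each worker's first connections land on a
     * different dest instead of all piling onto dest 0 */
    return 0;
}
```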
*/ if ((rt->rt6_flags & RTF_KNI) || (rt->rt6_flags & RTF_LOCALIN)) { route6_put(rt); @@ -914,13 +914,13 @@ static int dp_vs_service_set(sockoptid_t opt, const void *user, size_t len) struct in_addr *vip; lcoreid_t cid = rte_lcore_id(); - if (opt == DPVS_SO_SET_GRATARP && cid == rte_get_master_lcore()){ + if (opt == DPVS_SO_SET_GRATARP && cid == rte_get_main_lcore()){ vip = (struct in_addr *)user; return gratuitous_arp_send_vip(vip); } // send to slave core - if (cid == rte_get_master_lcore()) { + if (cid == rte_get_main_lcore()) { struct dpvs_msg *msg; msg = msg_make(set_opt_so2msg(opt), svc_msg_seq(), DPVS_MSG_MULTICAST, cid, len, user); @@ -1262,7 +1262,7 @@ static int dp_vs_service_get(sockoptid_t opt, const void *user, size_t len, void return EDPVS_MSG_FAIL; } - if (cid == rte_get_master_lcore()) { + if (cid == rte_get_main_lcore()) { output = rte_zmalloc("get_services", size, 0); if (unlikely(NULL == output)) { msg_destroy(&msg); @@ -1331,7 +1331,7 @@ static int dp_vs_service_get(sockoptid_t opt, const void *user, size_t len, void return EDPVS_MSG_FAIL; } - if (cid == rte_get_master_lcore()) { + if (cid == rte_get_main_lcore()) { svc = dp_vs_service_get_lcore(entry, cid); if (!svc) { msg_destroy(&msg); @@ -1426,7 +1426,7 @@ static int dp_vs_service_get(sockoptid_t opt, const void *user, size_t len, void return EDPVS_MSG_FAIL; } - if (cid == rte_get_master_lcore()) { + if (cid == rte_get_main_lcore()) { svc = dp_vs_service_get_lcore(&entry, cid); if (!svc) { msg_destroy(&msg); diff --git a/src/ipvs/ip_vs_synproxy.c b/src/ipvs/ip_vs_synproxy.c index 17efe7502..19e3003f5 100644 --- a/src/ipvs/ip_vs_synproxy.c +++ b/src/ipvs/ip_vs_synproxy.c @@ -633,7 +633,7 @@ static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf, } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return; - tcp6_send_csum((struct ipv6_hdr*)ip6h, th); + tcp6_send_csum((struct rte_ipv6_hdr*)ip6h, th); } } else { uint32_t tmpaddr; @@ -649,17 +649,17 @@ static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf, if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) { mbuf->l3_len = iphlen; mbuf->l4_len = (th->doff << 2); - th->check = rte_ipv4_phdr_cksum((struct ipv4_hdr*)iph, mbuf->ol_flags); + th->check = rte_ipv4_phdr_cksum((struct rte_ipv4_hdr*)iph, mbuf->ol_flags); } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return; - tcp4_send_csum((struct ipv4_hdr*)iph, th); + tcp4_send_csum((struct rte_ipv4_hdr*)iph, th); } if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) iph->check = 0; else - ip4_send_csum((struct ipv4_hdr*)iph); + ip4_send_csum((struct rte_ipv4_hdr*)iph); } } @@ -682,8 +682,8 @@ int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf, struct tcphdr *th, _tcph; struct dp_vs_synproxy_opt tcp_opt; struct netif_port *dev; - struct ether_hdr *eth; - struct ether_addr ethaddr; + struct rte_ether_hdr *eth; + struct rte_ether_addr ethaddr; th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph); if (unlikely(NULL == th)) @@ -716,7 +716,7 @@ int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf, /* mbuf will be reused and ether header will be set. * FIXME: to support non-ether packets. */ - if (mbuf->l2_len != sizeof(struct ether_hdr)) + if (mbuf->l2_len != sizeof(struct rte_ether_hdr)) goto syn_rcv_out; /* update statistics */ @@ -743,14 +743,14 @@ int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf, /* set L2 header and send the packet out * It is noted that "ipv4_xmit" should not used here, * because mbuf is reused. 
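In the syn-proxy the ingress mbuf itself is turned around and answered to the client: the hunk that follows re-prepends the saved Ethernet header and swaps its source and destination addresses before `netif_xmit()`. The swap in isolation, using the renamed `rte_ether_*` API (a sketch; the patch itself keeps the `memcpy()` variant):

```c
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <rte_mbuf.h>

/* Swap src/dst MAC on a reused frame so it can head straight back out
 * of the port it arrived on. */
static int ether_swap_sketch(struct rte_mbuf *mbuf)
{
    struct rte_ether_hdr *eth;
    struct rte_ether_addr tmp;

    eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len);
    if (unlikely(eth == NULL))
        return -1;

    rte_ether_addr_copy(&eth->s_addr, &tmp);
    rte_ether_addr_copy(&eth->d_addr, &eth->s_addr);
    rte_ether_addr_copy(&tmp, &eth->d_addr);
    return 0;
}
```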
*/ - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); if (unlikely(!eth)) { RTE_LOG(ERR, IPVS, "%s: no memory\n", __func__); goto syn_rcv_out; } - memcpy(&ethaddr, &eth->s_addr, sizeof(struct ether_addr)); - memcpy(&eth->s_addr, &eth->d_addr, sizeof(struct ether_addr)); - memcpy(&eth->d_addr, &ethaddr, sizeof(struct ether_addr)); + memcpy(&ethaddr, &eth->s_addr, sizeof(struct rte_ether_addr)); + memcpy(&eth->s_addr, &eth->d_addr, sizeof(struct rte_ether_addr)); + memcpy(&eth->d_addr, &ethaddr, sizeof(struct rte_ether_addr)); if (unlikely(EDPVS_OK != (ret = netif_xmit(mbuf, dev)))) { RTE_LOG(ERR, IPVS, "%s: netif_xmit failed -- %s\n", @@ -842,7 +842,7 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, //RTE_LOG(WARNING, IPVS, "%s: %s\n", __func__, dpvs_strerror(EDPVS_NOMEM)); return EDPVS_NOMEM; } - syn_mbuf->userdata = NULL; /* make sure "no route info" */ + mbuf_userdata_reset(syn_mbuf); /* make sure "no route info" */ /* Reserve space for tcp header */ tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MAXSEG @@ -899,7 +899,7 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, struct iphdr *syn_iph; /* Reserve space for ipv4 header */ - syn_iph = (struct iphdr *)rte_pktmbuf_prepend(syn_mbuf, sizeof(struct ipv4_hdr)); + syn_iph = (struct iphdr *)rte_pktmbuf_prepend(syn_mbuf, sizeof(struct rte_ipv4_hdr)); if (!syn_iph) { rte_pktmbuf_free(syn_mbuf); //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM)); return EDPVS_NOROOM; } ack_iph = (struct iphdr *)ip4_hdr(mbuf); *((uint16_t *) syn_iph) = htons((4 << 12) | (5 << 8) | (ack_iph->tos & 0x1E)); syn_iph->tot_len = htons(syn_mbuf->pkt_len); - syn_iph->frag_off = htons(IPV4_HDR_DF_FLAG); + syn_iph->frag_off = htons(RTE_IPV4_HDR_DF_FLAG); syn_iph->ttl = 64; syn_iph->protocol = IPPROTO_TCP; syn_iph->saddr = ack_iph->saddr; @@ -930,7 +930,7 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, return EDPVS_NOMEM; } - syn_mbuf_cloned->userdata = NULL; + mbuf_userdata_reset(syn_mbuf_cloned); cp->syn_mbuf = syn_mbuf_cloned; sp_dbg_stats32_inc(sp_syn_saved); rte_atomic32_set(&cp->syn_retry_max, dp_vs_synproxy_ctrl_syn_retry); @@ -1029,7 +1029,7 @@ static int syn_proxy_build_tcp_rst(int af, struct rte_mbuf *mbuf, } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - tcp6_send_csum((struct ipv6_hdr*)ip6h, th); + tcp6_send_csum((struct rte_ipv6_hdr*)ip6h, th); } } else { uint32_t tmpaddr; @@ -1046,17 +1046,17 @@ if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) { mbuf->l3_len = l3_len; mbuf->l4_len = l4_len; - th->check = rte_ipv4_phdr_cksum((struct ipv4_hdr*)ip4h, mbuf->ol_flags); + th->check = rte_ipv4_phdr_cksum((struct rte_ipv4_hdr*)ip4h, mbuf->ol_flags); } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return EDPVS_INVPKT; - tcp4_send_csum((struct ipv4_hdr*)ip4h, th); + tcp4_send_csum((struct rte_ipv4_hdr*)ip4h, th); } if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) ip4h->check = 0; else - ip4_send_csum((struct ipv4_hdr*)ip4h); + ip4_send_csum((struct rte_ipv4_hdr*)ip4h); } return EDPVS_OK; @@ -1068,8 +1068,8 @@ static int syn_proxy_send_tcp_rst(int af, struct rte_mbuf *mbuf) { struct tcphdr *th; struct netif_port *dev; - struct ether_hdr *eth; - struct ether_addr ethaddr; + struct rte_ether_hdr *eth; + struct rte_ether_addr ethaddr; uint32_t l3_len, l4_len; void *l3_hdr; @@ 
-1096,19 +1096,19 @@ static int syn_proxy_send_tcp_rst(int af, struct rte_mbuf *mbuf) th, l3_len, l4_len)) return EDPVS_INVPKT; - if (mbuf->l2_len < sizeof(struct ether_hdr)) + if (mbuf->l2_len < sizeof(struct rte_ether_hdr)) return EDPVS_INVPKT; /* set L2 header and send the packet out * It is noted that "ipv4_xmit" should not used here, * because mbuf is reused. */ - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); if (unlikely(!eth)) { RTE_LOG(ERR, IPVS, "%s: no memory\n", __func__); return EDPVS_NOMEM; } - memcpy(&ethaddr, &eth->s_addr, sizeof(struct ether_addr)); - memcpy(&eth->s_addr, &eth->d_addr, sizeof(struct ether_addr)); - memcpy(&eth->d_addr, &ethaddr, sizeof(struct ether_addr)); + memcpy(&ethaddr, &eth->s_addr, sizeof(struct rte_ether_addr)); + memcpy(&eth->s_addr, &eth->d_addr, sizeof(struct rte_ether_addr)); + memcpy(&eth->d_addr, &ethaddr, sizeof(struct rte_ether_addr)); dev = netif_port_get(mbuf->port); if (unlikely(!dev)) { @@ -1316,7 +1316,7 @@ static int syn_proxy_send_window_update(int af, struct rte_mbuf *mbuf, struct dp RTE_LOG(WARNING, IPVS, "%s: %s\n", __func__, dpvs_strerror(EDPVS_NOMEM)); return EDPVS_NOMEM; } - ack_mbuf->userdata = NULL; + mbuf_userdata_reset(ack_mbuf); ack_th = (struct tcphdr *)rte_pktmbuf_prepend(ack_mbuf, sizeof(struct tcphdr)); if (!ack_th) { @@ -1351,22 +1351,22 @@ static int syn_proxy_send_window_update(int af, struct rte_mbuf *mbuf, struct dp ack_ip6h->ip6_nxt = NEXTHDR_TCP; ack_mbuf->l3_len = sizeof(*ack_ip6h); } else { - struct ipv4_hdr *ack_iph; - struct ipv4_hdr *reuse_iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *ack_iph; + struct rte_ipv4_hdr *reuse_iph = ip4_hdr(mbuf); int pkt_ack_len = sizeof(struct tcphdr) + sizeof(struct iphdr); /* Reserve space for ipv4 header */ - ack_iph = (struct ipv4_hdr *)rte_pktmbuf_prepend(ack_mbuf, sizeof(struct ipv4_hdr)); + ack_iph = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(ack_mbuf, sizeof(struct rte_ipv4_hdr)); if (!ack_iph) { rte_pktmbuf_free(ack_mbuf); RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM)); return EDPVS_NOROOM; } - memcpy(ack_iph, reuse_iph, sizeof(struct ipv4_hdr)); + memcpy(ack_iph, reuse_iph, sizeof(struct rte_ipv4_hdr)); /* version and ip header length */ ack_iph->version_ihl = 0x45; ack_iph->type_of_service = 0; - ack_iph->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ack_iph->fragment_offset = htons(RTE_IPV4_HDR_DF_FLAG); ack_iph->total_length = htons(pkt_ack_len); ack_mbuf->l3_len = sizeof(*ack_iph); } diff --git a/src/ipvs/ip_vs_whtlst.c b/src/ipvs/ip_vs_whtlst.c index e0be714bf..b017af0f3 100644 --- a/src/ipvs/ip_vs_whtlst.c +++ b/src/ipvs/ip_vs_whtlst.c @@ -151,7 +151,7 @@ static int dp_vs_whtlst_add(int af, uint8_t proto, const union inet_addr *vaddr, struct dpvs_msg *msg; struct dp_vs_whtlst_conf cf; - if (cid != rte_get_master_lcore()) { + if (cid != rte_get_main_lcore()) { RTE_LOG(INFO, SERVICE, "[%s] must set from master lcore\n", __func__); return EDPVS_NOTSUPP; } @@ -194,7 +194,7 @@ static int dp_vs_whtlst_del(int af, uint8_t proto, const union inet_addr *vaddr, struct dpvs_msg *msg; struct dp_vs_whtlst_conf cf; - if (cid != rte_get_master_lcore()) { + if (cid != rte_get_main_lcore()) { RTE_LOG(INFO, SERVICE, "[%s] must set from master lcore\n", __func__); return EDPVS_NOTSUPP; } @@ -444,8 +444,8 @@ int dp_vs_whtlst_init(void) rte_atomic32_set(&this_num_whtlsts, 0); - rte_eal_mp_remote_launch(whtlst_lcore_init, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + 
rte_eal_mp_remote_launch(whtlst_lcore_init, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, SERVICE, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); @@ -497,8 +497,8 @@ int dp_vs_whtlst_term(void) if ((err = sockopt_unregister(&whtlst_sockopts)) != EDPVS_OK) return err; - rte_eal_mp_remote_launch(whtlst_lcore_term, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(whtlst_lcore_term, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, SERVICE, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); diff --git a/src/ipvs/ip_vs_wlc.c b/src/ipvs/ip_vs_wlc.c index 550853ada..1d695fbb0 100644 --- a/src/ipvs/ip_vs_wlc.c +++ b/src/ipvs/ip_vs_wlc.c @@ -26,9 +26,11 @@ static inline unsigned int dp_vs_wlc_dest_overhead(struct dp_vs_dest *dest) static struct dp_vs_dest *dp_vs_wlc_schedule(struct dp_vs_service *svc, const struct rte_mbuf *mbuf, const struct dp_vs_iphdr *iph __rte_unused) { + struct list_head *first, *cur; struct dp_vs_dest *dest, *least; unsigned int loh, doh; + first = dp_vs_sched_first_dest(svc); /* * We calculate the load of each dest server as follows: * (dest overhead) / dest->weight @@ -36,26 +38,36 @@ static struct dp_vs_dest *dp_vs_wlc_schedule(struct dp_vs_service *svc, * The server with weight=0 is quiesced and will not receive any * new connections. */ - - list_for_each_entry(dest, &svc->dests, n_list) { + cur = first; + do { + if (unlikely(cur == &svc->dests)) { + cur = cur->next; + continue; + } + dest = list_entry(cur, struct dp_vs_dest, n_list); if (dp_vs_dest_is_valid(dest)) { least = dest; loh = dp_vs_wlc_dest_overhead(least); goto nextstage; } - } + cur = cur->next; + } while (cur != first); + return NULL; /* * Find the destination with the least load. 
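With per-worker starting points, both passes of the new `dp_vs_wlc_schedule()` below must treat the dest list as a ring and skip the `&svc->dests` head, which is a list anchor rather than a real dest. The traversal skeleton in plain C:

```c
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

/* Walk a circular list once starting at `first`, skipping the sentinel
 * `head`, and return the first node `match` accepts. */
static struct list_head *ring_scan(struct list_head *head,
                                   struct list_head *first,
                                   int (*match)(const struct list_head *))
{
    struct list_head *cur = first;

    do {
        if (cur != head && match(cur))
            return cur;
        cur = cur->next;
    } while (cur != first);

    return NULL;    /* full circle, no valid dest */
}
```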
*/ nextstage: - list_for_each_entry_continue(dest, &svc->dests, n_list) { + for (cur = cur->next; cur != first; cur = cur->next) { + if (unlikely(cur == &svc->dests)) + continue; + dest = list_entry(cur, struct dp_vs_dest, n_list); if (dest->flags & DPVS_DEST_F_OVERLOAD) continue; doh = dp_vs_wlc_dest_overhead(dest); if (loh * rte_atomic16_read(&dest->weight) > - doh * rte_atomic16_read(&least->weight)) { + doh * rte_atomic16_read(&least->weight)) { least = dest; loh = doh; } diff --git a/src/ipvs/ip_vs_wrr.c b/src/ipvs/ip_vs_wrr.c index a0a4551fb..cbe163e17 100644 --- a/src/ipvs/ip_vs_wrr.c +++ b/src/ipvs/ip_vs_wrr.c @@ -54,7 +54,7 @@ static int dp_vs_wrr_init_svc(struct dp_vs_service *svc) if (mark == NULL) { return EDPVS_NOMEM; } - mark->cl = &svc->dests; + mark->cl = dp_vs_sched_first_dest(svc); mark->cw = 0; mark->mw = dp_vs_wrr_max_weight(svc); mark->di = dp_vs_gcd_weight(svc); @@ -78,7 +78,7 @@ static int dp_vs_wrr_update_svc(struct dp_vs_service *svc, { struct dp_vs_wrr_mark *mark = svc->sched_data; - mark->cl = &svc->dests; + mark->cl = dp_vs_sched_first_dest(svc); mark->mw = dp_vs_wrr_max_weight(svc); mark->di = dp_vs_gcd_weight(svc); if (mark->cw > mark->mw) diff --git a/src/ipvs/ip_vs_xmit.c b/src/ipvs/ip_vs_xmit.c index 375313f60..3208eeb1e 100644 --- a/src/ipvs/ip_vs_xmit.c +++ b/src/ipvs/ip_vs_xmit.c @@ -37,16 +37,16 @@ static int __dp_vs_fast_xmit_fnat4(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct ipv4_hdr *ip4h = ip4_hdr(mbuf); - struct ether_hdr *eth; - uint16_t packet_type = ETHER_TYPE_IPv4; + struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct rte_ether_hdr *eth; + uint16_t packet_type = RTE_ETHER_TYPE_IPV4; int err; if (unlikely(conn->in_dev == NULL)) return EDPVS_NOROUTE; - if (unlikely(is_zero_ether_addr(&conn->in_dmac) || - is_zero_ether_addr(&conn->in_smac))) + if (unlikely(rte_is_zero_ether_addr(&conn->in_dmac) || + rte_is_zero_ether_addr(&conn->in_smac))) return EDPVS_NOTSUPP; /* pre-handler before translation */ @@ -78,10 +78,10 @@ static int __dp_vs_fast_xmit_fnat4(struct dp_vs_proto *proto, ip4_send_csum(ip4h); } - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, - (uint16_t)sizeof(struct ether_hdr)); - ether_addr_copy(&conn->in_dmac, &eth->d_addr); - ether_addr_copy(&conn->in_smac, &eth->s_addr); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, + (uint16_t)sizeof(struct rte_ether_hdr)); + rte_ether_addr_copy(&conn->in_dmac, &eth->d_addr); + rte_ether_addr_copy(&conn->in_smac, &eth->s_addr); eth->ether_type = rte_cpu_to_be_16(packet_type); mbuf->packet_type = packet_type; @@ -98,15 +98,15 @@ static int __dp_vs_fast_xmit_fnat6(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct ip6_hdr *ip6h = ip6_hdr(mbuf); - struct ether_hdr *eth; - uint16_t packet_type = ETHER_TYPE_IPv6; + struct rte_ether_hdr *eth; + uint16_t packet_type = RTE_ETHER_TYPE_IPV6; int err; if (unlikely(conn->in_dev == NULL)) return EDPVS_NOROUTE; - if (unlikely(is_zero_ether_addr(&conn->in_dmac) || - is_zero_ether_addr(&conn->in_smac))) + if (unlikely(rte_is_zero_ether_addr(&conn->in_dmac) || + rte_is_zero_ether_addr(&conn->in_smac))) return EDPVS_NOTSUPP; /* pre-handler before translation */ @@ -131,10 +131,10 @@ static int __dp_vs_fast_xmit_fnat6(struct dp_vs_proto *proto, return err; } - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, - (uint16_t)sizeof(struct ether_hdr)); - ether_addr_copy(&conn->in_dmac, &eth->d_addr); - ether_addr_copy(&conn->in_smac, &eth->s_addr); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, + 
(uint16_t)sizeof(struct rte_ether_hdr)); + rte_ether_addr_copy(&conn->in_dmac, &eth->d_addr); + rte_ether_addr_copy(&conn->in_smac, &eth->s_addr); eth->ether_type = rte_cpu_to_be_16(packet_type); mbuf->packet_type = packet_type; @@ -159,16 +159,16 @@ static int __dp_vs_fast_outxmit_fnat4(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct ipv4_hdr *ip4h = ip4_hdr(mbuf); - struct ether_hdr *eth; - uint16_t packet_type = ETHER_TYPE_IPv4; + struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct rte_ether_hdr *eth; + uint16_t packet_type = RTE_ETHER_TYPE_IPV4; int err; if (unlikely(conn->out_dev == NULL)) return EDPVS_NOROUTE; - if (unlikely(is_zero_ether_addr(&conn->out_dmac) || - is_zero_ether_addr(&conn->out_smac))) + if (unlikely(rte_is_zero_ether_addr(&conn->out_dmac) || + rte_is_zero_ether_addr(&conn->out_smac))) return EDPVS_NOTSUPP; /* pre-handler before translation */ @@ -200,10 +200,10 @@ static int __dp_vs_fast_outxmit_fnat4(struct dp_vs_proto *proto, ip4_send_csum(ip4h); } - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, - (uint16_t)sizeof(struct ether_hdr)); - ether_addr_copy(&conn->out_dmac, &eth->d_addr); - ether_addr_copy(&conn->out_smac, &eth->s_addr); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, + (uint16_t)sizeof(struct rte_ether_hdr)); + rte_ether_addr_copy(&conn->out_dmac, &eth->d_addr); + rte_ether_addr_copy(&conn->out_smac, &eth->s_addr); eth->ether_type = rte_cpu_to_be_16(packet_type); mbuf->packet_type = packet_type; @@ -220,15 +220,15 @@ static int __dp_vs_fast_outxmit_fnat6(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct ip6_hdr *ip6h = ip6_hdr(mbuf); - struct ether_hdr *eth; - uint16_t packet_type = ETHER_TYPE_IPv6; + struct rte_ether_hdr *eth; + uint16_t packet_type = RTE_ETHER_TYPE_IPV6; int err; if (unlikely(conn->out_dev == NULL)) return EDPVS_NOROUTE; - if (unlikely(is_zero_ether_addr(&conn->out_dmac) || - is_zero_ether_addr(&conn->out_smac))) + if (unlikely(rte_is_zero_ether_addr(&conn->out_dmac) || + rte_is_zero_ether_addr(&conn->out_smac))) return EDPVS_NOTSUPP; /* pre-handler before translation */ @@ -253,10 +253,10 @@ static int __dp_vs_fast_outxmit_fnat6(struct dp_vs_proto *proto, return err; } - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, - (uint16_t)sizeof(struct ether_hdr)); - ether_addr_copy(&conn->out_dmac, &eth->d_addr); - ether_addr_copy(&conn->out_smac, &eth->s_addr); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, + (uint16_t)sizeof(struct rte_ether_hdr)); + rte_ether_addr_copy(&conn->out_dmac, &eth->d_addr); + rte_ether_addr_copy(&conn->out_smac, &eth->s_addr); eth->ether_type = rte_cpu_to_be_16(packet_type); mbuf->packet_type = packet_type; @@ -285,26 +285,26 @@ static void dp_vs_save_xmit_info(struct rte_mbuf *mbuf, struct dp_vs_proto *proto, struct dp_vs_conn *conn) { - struct ether_hdr *eth = NULL; + struct rte_ether_hdr *eth = NULL; struct netif_port *port = NULL; - if (!is_zero_ether_addr(&conn->out_dmac) && - !is_zero_ether_addr(&conn->out_smac)) + if (!rte_is_zero_ether_addr(&conn->out_dmac) && + !rte_is_zero_ether_addr(&conn->out_smac)) return; - if (unlikely(mbuf->l2_len != sizeof(struct ether_hdr))) + if (unlikely(mbuf->l2_len != sizeof(struct rte_ether_hdr))) return; port = netif_port_get(mbuf->port); if (port) conn->out_dev = port; - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); - ether_addr_copy(&eth->s_addr, &conn->out_dmac); - ether_addr_copy(&eth->d_addr, &conn->out_smac); + 
rte_ether_addr_copy(&eth->s_addr, &conn->out_dmac); + rte_ether_addr_copy(&eth->d_addr, &conn->out_smac); - rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr)); + rte_pktmbuf_adj(mbuf, sizeof(struct rte_ether_hdr)); } /* @@ -314,26 +314,26 @@ static void dp_vs_save_outxmit_info(struct rte_mbuf *mbuf, struct dp_vs_proto *proto, struct dp_vs_conn *conn) { - struct ether_hdr *eth = NULL; + struct rte_ether_hdr *eth = NULL; struct netif_port *port = NULL; - if (!is_zero_ether_addr(&conn->in_dmac) && - !is_zero_ether_addr(&conn->in_smac)) + if (!rte_is_zero_ether_addr(&conn->in_dmac) && + !rte_is_zero_ether_addr(&conn->in_smac)) return; - if (mbuf->l2_len != sizeof(struct ether_hdr)) + if (mbuf->l2_len != sizeof(struct rte_ether_hdr)) return; port = netif_port_get(mbuf->port); if (port) conn->in_dev = port; - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, mbuf->l2_len); - ether_addr_copy(&eth->s_addr, &conn->in_dmac); - ether_addr_copy(&eth->d_addr, &conn->in_smac); + rte_ether_addr_copy(&eth->s_addr, &conn->in_dmac); + rte_ether_addr_copy(&eth->d_addr, &conn->in_smac); - rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr)); + rte_pktmbuf_adj(mbuf, sizeof(struct rte_ether_hdr)); } /* @@ -393,7 +393,7 @@ static int __dp_vs_xmit_fnat4(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow4 fl4; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; @@ -408,10 +408,10 @@ static int __dp_vs_xmit_fnat4(struct dp_vs_proto *proto, * drop old route. just for safe, because * FNAT is PRE_ROUTING, should not have route. 
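The `__dp_vs_fast_xmit_*` helpers above are the payoff of the MAC-saving code: once both MACs are cached on the connection, packets bypass route and neighbour lookup entirely. A condensed sketch of the send side (IPv4 case only; DPVS types and `EDPVS_*` codes are assumed from context):

```c
/* Fast path sketch: write the cached MACs into a freshly prepended
 * Ethernet header and transmit on the cached device, skipping the
 * route/neighbour lookup. */
static int fast_xmit_sketch(struct dp_vs_conn *conn, struct rte_mbuf *mbuf)
{
    struct rte_ether_hdr *eth;

    if (unlikely(conn->in_dev == NULL))
        return EDPVS_NOROUTE;

    if (unlikely(rte_is_zero_ether_addr(&conn->in_dmac) ||
                 rte_is_zero_ether_addr(&conn->in_smac)))
        return EDPVS_NOTSUPP;       /* MACs not learned yet, slow path */

    eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf,
              (uint16_t)sizeof(struct rte_ether_hdr));
    if (unlikely(eth == NULL))
        return EDPVS_NOROOM;

    rte_ether_addr_copy(&conn->in_dmac, &eth->d_addr);
    rte_ether_addr_copy(&conn->in_smac, &eth->s_addr);
    eth->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
    mbuf->packet_type = RTE_ETHER_TYPE_IPV4;

    return netif_xmit(mbuf, conn->in_dev);
}
```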
*/ - if (unlikely(mbuf->userdata != NULL)) { + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) { RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6 *)mbuf->userdata); + __func__, MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); } memset(&fl6, 0, sizeof(struct flow6)); @@ -546,7 +546,7 @@ static int __dp_vs_xmit_fnat6(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* after route lookup and before translation */ if (xmit_ttl) { @@ -599,7 +599,7 @@ static int __dp_vs_xmit_fnat64(struct dp_vs_proto *proto, { struct flow4 fl4; struct ip6_hdr *ip6h = ip6_hdr(mbuf); - struct ipv4_hdr *ip4h; + struct rte_ipv4_hdr *ip4h; uint32_t pkt_len; struct route_entry *rt; int err, mtu; @@ -608,10 +608,10 @@ static int __dp_vs_xmit_fnat64(struct dp_vs_proto *proto, * drop old route. just for safe, because * FNAT is PRE_ROUTING, should not have route. */ - if (unlikely(mbuf->userdata != NULL)) { + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) { RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6 *)mbuf->userdata); + __func__, MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); } memset(&fl4, 0, sizeof(struct flow4)); @@ -644,7 +644,7 @@ static int __dp_vs_xmit_fnat64(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; /* after route lookup and before translation */ if (xmit_ttl) { if (unlikely(ip6h->ip6_hops <= 1)) { @@ -718,7 +718,7 @@ static int __dp_vs_out_xmit_fnat4(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow4 fl4; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; @@ -733,8 +733,8 @@ static int __dp_vs_out_xmit_fnat4(struct dp_vs_proto *proto, * drop old route. just for safe, because * FNAT is PRE_ROUTING, should not have route. */ - if (unlikely(mbuf->userdata != NULL)) - route4_put((struct route_entry *)mbuf->userdata); + if (MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL) + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); memset(&fl4, 0, sizeof(struct flow4)); fl4.fl4_daddr = conn->caddr.in; @@ -755,14 +755,14 @@ static int __dp_vs_out_xmit_fnat4(struct dp_vs_proto *proto, mtu = rt->mtu; if (mbuf->pkt_len > mtu - && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + && (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); err = EDPVS_FRAG; goto errout; } - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; /* after route lookup and before translation */ if (xmit_ttl) { @@ -836,8 +836,8 @@ static int __dp_vs_out_xmit_fnat6(struct dp_vs_proto *proto, * drop old route. just for safe, because * FNAT is PRE_ROUTING, should not have route. 
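The `xmit_ttl` branches in these FNAT hunks decrement the TTL or hop limit only after the route lookup succeeds, and refuse to forward a packet whose hop limit would expire. Sketched for IPv6; treating `EDPVS_DROP` as the verdict, and omitting the ICMPv6 Time Exceeded notification the real code would send, are assumptions here:

```c
#include <netinet/ip6.h>
#include <rte_branch_prediction.h>

/* Decrement the hop limit of a forwarded IPv6 packet; refuse once it
 * would expire. EDPVS_DROP as verdict is an assumption; the real code
 * also emits an ICMPv6 Time Exceeded error. */
static int xmit_ttl_dec6(struct ip6_hdr *ip6h)
{
    if (unlikely(ip6h->ip6_hops <= 1))
        return EDPVS_DROP;

    ip6h->ip6_hops--;
    return EDPVS_OK;
}
```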
*/ - if (unlikely(mbuf->userdata != NULL)) - route6_put((struct route6 *)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); memset(&fl6, 0, sizeof(struct flow6)); fl6.fl6_daddr = conn->caddr.in6; @@ -863,7 +863,7 @@ static int __dp_vs_out_xmit_fnat6(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* after route lookup and before translation */ if (xmit_ttl) { @@ -915,7 +915,7 @@ static int __dp_vs_out_xmit_fnat46(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow6 fl6; - struct ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct rte_ipv4_hdr *ip4h = ip4_hdr(mbuf); uint32_t pkt_len; struct route6 *rt6; int err, mtu; @@ -924,10 +924,10 @@ static int __dp_vs_out_xmit_fnat46(struct dp_vs_proto *proto, * drop old route. just for safe, because * FNAT is PRE_ROUTING, should not have route. */ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", - __func__, mbuf->userdata); - route4_put((struct route_entry *)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); } memset(&fl6, 0, sizeof(struct flow6)); @@ -953,14 +953,14 @@ static int __dp_vs_out_xmit_fnat46(struct dp_vs_proto *proto, mtu = rt6->rt6_mtu; pkt_len = mbuf_nat4to6_len(mbuf); if (pkt_len > mtu - && (ip4h->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + && (ip4h->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); err = EDPVS_FRAG; goto errout; } - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* after route lookup and before translation */ if (xmit_ttl) { if (unlikely(ip4h->time_to_live <= 1)) { @@ -1026,10 +1026,10 @@ static void __dp_vs_xmit_icmp4(struct rte_mbuf *mbuf, struct dp_vs_proto *prot, struct dp_vs_conn *conn, int dir) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); struct icmphdr *icmph = (struct icmphdr *) ((unsigned char *)ip4_hdr(mbuf) + ip4_hdrlen(mbuf)); - struct ipv4_hdr *ciph = (struct ipv4_hdr *)(icmph + 1); + struct rte_ipv4_hdr *ciph = (struct rte_ipv4_hdr *)(icmph + 1); int fullnat = (conn->dest->fwdmode == DPVS_FWD_MODE_FNAT); uint16_t csum; @@ -1067,7 +1067,7 @@ static void __dp_vs_xmit_icmp4(struct rte_mbuf *mbuf, if (ciph->next_proto_id == IPPROTO_TCP || ciph->next_proto_id == IPPROTO_UDP) { uint16_t *ports = (void *)ciph + \ - ((ciph->version_ihl & IPV4_HDR_IHL_MASK)<<2); + ((ciph->version_ihl & RTE_IPV4_HDR_IHL_MASK)<<2); if (fullnat) { if (dir == DPVS_CONN_DIR_INBOUND) { @@ -1200,7 +1200,7 @@ static void __dp_vs_xmit_icmp6(struct rte_mbuf *mbuf, icmp6h->icmp6_cksum = 0; l4_len = ntohs(ip6h->ip6_plen); csum = rte_raw_cksum(icmp6h, l4_len); - csum += rte_ipv6_phdr_cksum((struct ipv6_hdr *)ip6h, 0); + csum += rte_ipv6_phdr_cksum((struct rte_ipv6_hdr *)ip6h, 0); csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff); csum = (~csum) & 0xffff; @@ -1230,14 +1230,14 @@ static int __dp_vs_xmit_dr4(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow4 fl4; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph 
= ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: Already have route %p ?\n", - __func__, mbuf->userdata); - route4_put((struct route_entry *)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: Already have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); } memset(&fl4, 0, sizeof(struct flow4)); @@ -1255,14 +1255,14 @@ static int __dp_vs_xmit_dr4(struct dp_vs_proto *proto, mtu = rt->mtu; if (mbuf->pkt_len > mtu - && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + && (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); err = EDPVS_FRAG; goto errout; } - mbuf->packet_type = ETHER_TYPE_IPv4; + mbuf->packet_type = RTE_ETHER_TYPE_IPV4; err = neigh_output(AF_INET, (union inet_addr *)&conn->daddr.in, mbuf, rt->port); route4_put(rt); return err; @@ -1283,10 +1283,10 @@ static int __dp_vs_xmit_dr6(struct dp_vs_proto *proto, struct route6 *rt6; int err, mtu; - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: Already have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6 *)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: Already have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); } memset(&fl6, 0, sizeof(struct flow6)); @@ -1309,7 +1309,7 @@ static int __dp_vs_xmit_dr6(struct dp_vs_proto *proto, goto errout; } - mbuf->packet_type = ETHER_TYPE_IPv6; + mbuf->packet_type = RTE_ETHER_TYPE_IPV6; err = neigh_output(AF_INET6, (union inet_addr *)&conn->daddr.in6, mbuf, rt6->rt6_dev); route6_put(rt6); return err; @@ -1338,7 +1338,7 @@ static int __dp_vs_xmit_snat4(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow4 fl4; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; @@ -1347,10 +1347,10 @@ static int __dp_vs_xmit_snat4(struct dp_vs_proto *proto, * inbound SNAT traffic is hooked at PRE_ROUTING, * should not have route. 
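Every xmit path repeats the same path-MTU guard, now spelled `RTE_IPV4_HDR_DF_FLAG`: an oversized packet carrying DF is bounced with ICMP "fragmentation needed" so the sender can lower its path MTU. The guard in isolation (`icmp_send` is DPVS's helper; `EDPVS_*` codes as in the surrounding hunks):

```c
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <rte_ip.h>
#include <rte_mbuf.h>

/* Bounce DF packets that exceed the route MTU with "fragmentation
 * needed"; icmp_send() is DPVS's helper, EDPVS_* as in the hunks. */
static int mtu_df_guard(struct rte_mbuf *mbuf, struct rte_ipv4_hdr *iph,
                        uint32_t mtu)
{
    if (mbuf->pkt_len > mtu &&
        (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) {
        icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG,
                  htonl(mtu));
        return EDPVS_FRAG;
    }
    return EDPVS_OK;
}
```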
*/ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: SNAT have route %p ?\n", - __func__, mbuf->userdata); - route4_put((struct route_entry *)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: SNAT have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); } /* @@ -1371,14 +1371,14 @@ static int __dp_vs_xmit_snat4(struct dp_vs_proto *proto, mtu = rt->mtu; if (mbuf->pkt_len > mtu - && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + && (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); err = EDPVS_FRAG; goto errout; } - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; /* after route lookup and before translation */ if (xmit_ttl) { @@ -1432,10 +1432,10 @@ static int __dp_vs_xmit_snat6(struct dp_vs_proto *proto, * inbound SNAT traffic is hooked at PRE_ROUTING, * should not have route. */ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: SNAT have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6 *)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: SNAT have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route6*, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6*, MBUF_FIELD_ROUTE)); } /* @@ -1461,7 +1461,7 @@ static int __dp_vs_xmit_snat6(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* after route lookup and before translation */ if (xmit_ttl) { @@ -1512,8 +1512,8 @@ static int __dp_vs_out_xmit_snat4(struct dp_vs_proto *proto, { int err; struct flow4 fl4; - struct route_entry *rt = mbuf->userdata; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct route_entry *rt = MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); if (!rt) { memset(&fl4, 0, sizeof(struct flow4)); @@ -1535,13 +1535,13 @@ static int __dp_vs_out_xmit_snat4(struct dp_vs_proto *proto, goto errout; } } - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; dp_vs_conn_cache_rt(conn, rt, false); } if (mbuf->pkt_len > rt->mtu && - (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); @@ -1591,15 +1591,15 @@ static int dp_vs_fast_xmit_nat(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct ether_hdr *eth; + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ether_hdr *eth; int err; if (unlikely(conn->in_dev == NULL)) return EDPVS_NOROUTE; - if (unlikely(is_zero_ether_addr(&conn->in_dmac) || - is_zero_ether_addr(&conn->in_smac))) + if (unlikely(rte_is_zero_ether_addr(&conn->in_dmac) || + rte_is_zero_ether_addr(&conn->in_smac))) return EDPVS_NOTSUPP; iph->hdr_checksum = 0; @@ -1617,12 +1617,12 @@ static int dp_vs_fast_xmit_nat(struct dp_vs_proto *proto, ip4_send_csum(iph); } - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, - (uint16_t)sizeof(struct ether_hdr)); - 
ether_addr_copy(&conn->in_dmac, ð->d_addr); - ether_addr_copy(&conn->in_smac, ð->s_addr); - eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); - mbuf->packet_type = ETHER_TYPE_IPv4; + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, + (uint16_t)sizeof(struct rte_ether_hdr)); + rte_ether_addr_copy(&conn->in_dmac, ð->d_addr); + rte_ether_addr_copy(&conn->in_smac, ð->s_addr); + eth->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4); + mbuf->packet_type = RTE_ETHER_TYPE_IPV4; err = netif_xmit(mbuf, conn->in_dev); if (err != EDPVS_OK) @@ -1636,15 +1636,15 @@ static int dp_vs_fast_outxmit_nat(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct ether_hdr *eth; + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ether_hdr *eth; int err; if (unlikely(conn->out_dev == NULL)) return EDPVS_NOROUTE; - if (unlikely(is_zero_ether_addr(&conn->out_dmac) || - is_zero_ether_addr(&conn->out_smac))) + if (unlikely(rte_is_zero_ether_addr(&conn->out_dmac) || + rte_is_zero_ether_addr(&conn->out_smac))) return EDPVS_NOTSUPP; iph->hdr_checksum = 0; @@ -1662,12 +1662,12 @@ static int dp_vs_fast_outxmit_nat(struct dp_vs_proto *proto, ip4_send_csum(iph); } - eth = (struct ether_hdr *)rte_pktmbuf_prepend(mbuf, - (uint16_t)sizeof(struct ether_hdr)); - ether_addr_copy(&conn->out_dmac, ð->d_addr); - ether_addr_copy(&conn->out_smac, ð->s_addr); - eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); - mbuf->packet_type = ETHER_TYPE_IPv4; + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, + (uint16_t)sizeof(struct rte_ether_hdr)); + rte_ether_addr_copy(&conn->out_dmac, ð->d_addr); + rte_ether_addr_copy(&conn->out_smac, ð->s_addr); + eth->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4); + mbuf->packet_type = RTE_ETHER_TYPE_IPV4; err = netif_xmit(mbuf, conn->out_dev); if (err != EDPVS_OK) @@ -1683,7 +1683,7 @@ static int __dp_vs_out_xmit_snat6(struct dp_vs_proto *proto, { int err; struct flow6 fl6; - struct route6 *rt6 = mbuf->userdata; + struct route6 *rt6 = MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE); struct ip6_hdr *ip6h = ip6_hdr(mbuf); if (!rt6) { @@ -1696,7 +1696,7 @@ static int __dp_vs_out_xmit_snat6(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; dp_vs_conn_cache_rt6(conn, rt6, false); } @@ -1756,7 +1756,7 @@ static int __dp_vs_xmit_nat4(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow4 fl4; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; @@ -1771,10 +1771,10 @@ static int __dp_vs_xmit_nat4(struct dp_vs_proto *proto, * drop old route. just for safe, because * NAT is PREROUTING, should not have route. 
*/ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", - __func__, mbuf->userdata); - route4_put((struct route_entry*)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); } memset(&fl4, 0, sizeof(struct flow4)); @@ -1791,7 +1791,7 @@ static int __dp_vs_xmit_nat4(struct dp_vs_proto *proto, mtu = rt->mtu; if (mbuf->pkt_len > mtu - && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + && (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); @@ -1799,7 +1799,7 @@ static int __dp_vs_xmit_nat4(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; /* after route lookup and before translation */ if (xmit_ttl) { @@ -1852,10 +1852,10 @@ static int __dp_vs_xmit_nat6(struct dp_vs_proto *proto, * drop old route. just for safe, because * NAT is PREROUTING, should not have route. */ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6*)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); } memset(&fl6, 0, sizeof(struct flow6)); @@ -1877,7 +1877,7 @@ static int __dp_vs_xmit_nat6(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* after route lookup and before translation */ if (xmit_ttl) { @@ -1927,7 +1927,7 @@ static int __dp_vs_out_xmit_nat4(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow4 fl4; - struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *iph = ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; @@ -1942,10 +1942,10 @@ static int __dp_vs_out_xmit_nat4(struct dp_vs_proto *proto, * drop old route. just for safe, because * NAT is PREROUTING, should not have route. 
*/ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", - __func__, mbuf->userdata); - route4_put((struct route_entry*)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); } memset(&fl4, 0, sizeof(struct flow4)); @@ -1962,7 +1962,7 @@ static int __dp_vs_out_xmit_nat4(struct dp_vs_proto *proto, mtu = rt->mtu; if (mbuf->pkt_len > mtu - && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + && (iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); @@ -1971,7 +1971,7 @@ static int __dp_vs_out_xmit_nat4(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; /* after route lookup and before translation */ if (xmit_ttl) { @@ -2024,10 +2024,10 @@ static int __dp_vs_out_xmit_nat6(struct dp_vs_proto *proto, * drop old route. just for safe, because * NAT is PREROUTING, should not have route. */ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6*)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); } memset(&fl6, 0, sizeof(struct flow6)); @@ -2049,7 +2049,7 @@ static int __dp_vs_out_xmit_nat6(struct dp_vs_proto *proto, goto errout; } - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; /* after route lookup and before translation */ if (xmit_ttl) { @@ -2103,20 +2103,20 @@ static int __dp_vs_xmit_tunnel4(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct flow4 fl4; - struct ipv4_hdr *new_iph, *old_iph = ip4_hdr(mbuf); + struct rte_ipv4_hdr *new_iph, *old_iph = ip4_hdr(mbuf); struct route_entry *rt; uint8_t tos = old_iph->type_of_service; - uint16_t df = old_iph->fragment_offset & htons(IPV4_HDR_DF_FLAG); + uint16_t df = old_iph->fragment_offset & htons(RTE_IPV4_HDR_DF_FLAG); int err, mtu; /* * drop old route. just for safe, because * TUNNEL is PREROUTING, should not have route. 
*/ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: TUNNEL have route %p ?\n", - __func__, mbuf->userdata); - route4_put((struct route_entry*)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: TUNNEL have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); + route4_put(MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE)); } memset(&fl4, 0, sizeof(struct flow4)); @@ -2131,9 +2131,9 @@ static int __dp_vs_xmit_tunnel4(struct dp_vs_proto *proto, dp_vs_conn_cache_rt(conn, rt, true); mtu = rt->mtu; - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; - new_iph = (struct ipv4_hdr*)rte_pktmbuf_prepend(mbuf, sizeof(struct ipv4_hdr)); + new_iph = (struct rte_ipv4_hdr*)rte_pktmbuf_prepend(mbuf, sizeof(struct rte_ipv4_hdr)); if (!new_iph) { RTE_LOG(WARNING, IPVS, "%s: mbuf has not enough headroom" " space for ipvs tunnel\n", __func__); @@ -2149,7 +2149,7 @@ static int __dp_vs_xmit_tunnel4(struct dp_vs_proto *proto, goto errout; } - memset(new_iph, 0, sizeof(struct ipv4_hdr)); + memset(new_iph, 0, sizeof(struct rte_ipv4_hdr)); new_iph->version_ihl = 0x45; new_iph->type_of_service = tos; new_iph->total_length = htons(mbuf->pkt_len); @@ -2194,10 +2194,10 @@ static int __dp_vs_xmit_tunnel6(struct dp_vs_proto *proto, * drop old route. just for safe, because * TUNNEL is PREROUTING, should not have route. */ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: TUNNEL have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6*)mbuf->userdata); + if (MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL) { + RTE_LOG(WARNING, IPVS, "%s: TUNNEL have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); } memset(&fl6, 0, sizeof(struct flow6)); @@ -2211,7 +2211,7 @@ static int __dp_vs_xmit_tunnel6(struct dp_vs_proto *proto, dp_vs_conn_cache_rt6(conn, rt6, true); mtu = rt6->rt6_mtu; - mbuf->userdata = rt6; + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) = rt6; new_ip6h = (struct ip6_hdr*)rte_pktmbuf_prepend(mbuf, sizeof(struct ip6_hdr)); if (!new_ip6h) { @@ -2262,17 +2262,17 @@ static int __dp_vs_xmit_tunnel_6o4(struct dp_vs_proto *proto, int err, mtu; struct flow4 fl4; struct route_entry *rt; - struct ipv4_hdr *new_iph; + struct rte_ipv4_hdr *new_iph; struct ip6_hdr *old_ip6h = ip6_hdr(mbuf); /* * drop old route. just for safe, because * TUNNEL is PREROUTING, should not have route. 
*/ - if (unlikely(mbuf->userdata != NULL)) { - RTE_LOG(WARNING, IPVS, "%s: TUNNEL have route %p ?\n", - __func__, mbuf->userdata); - route6_put((struct route6*)mbuf->userdata); + if (unlikely(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE) != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: TUNNEL have route %p ?\n", __func__, + MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); + route6_put(MBUF_USERDATA(mbuf, struct route6 *, MBUF_FIELD_ROUTE)); } memset(&fl4, 0, sizeof(struct flow4)); @@ -2287,9 +2287,9 @@ static int __dp_vs_xmit_tunnel_6o4(struct dp_vs_proto *proto, dp_vs_conn_cache_rt(conn, rt, true); mtu = rt->mtu; - mbuf->userdata = rt; + MBUF_USERDATA(mbuf, struct route_entry *, MBUF_FIELD_ROUTE) = rt; - new_iph = (struct ipv4_hdr*)rte_pktmbuf_prepend(mbuf, sizeof(struct ipv4_hdr)); + new_iph = (struct rte_ipv4_hdr*)rte_pktmbuf_prepend(mbuf, sizeof(struct rte_ipv4_hdr)); if (!new_iph) { RTE_LOG(WARNING, IPVS, "%s: mbuf has not enough headroom" " space for ipvs tunnel\n", __func__); @@ -2304,11 +2304,11 @@ static int __dp_vs_xmit_tunnel_6o4(struct dp_vs_proto *proto, goto errout; } - memset(new_iph, 0, sizeof(struct ipv4_hdr)); + memset(new_iph, 0, sizeof(struct rte_ipv4_hdr)); new_iph->version_ihl = 0x45; new_iph->type_of_service = 0; new_iph->total_length = htons(mbuf->pkt_len); - new_iph->fragment_offset = htons(IPV4_HDR_DF_FLAG); + new_iph->fragment_offset = htons(RTE_IPV4_HDR_DF_FLAG); new_iph->time_to_live = old_ip6h->ip6_hlim; new_iph->next_proto_id = IPPROTO_IPV6; new_iph->src_addr = rt->src.s_addr; diff --git a/src/kni.c b/src/kni.c index 15b618904..475f2fdcf 100644 --- a/src/kni.c +++ b/src/kni.c @@ -55,6 +55,14 @@ static void kni_fill_conf(const struct netif_port *dev, const char *ifname, conf->group_id = dev->id; conf->mbuf_size = KNI_DEF_MBUF_SIZE; + /* + * kni device should use same mac as real device, + * because it may config same IP of real device. + * diff mac means kni cannot accept packets sent + * to real-device. + */ + memcpy(conf->mac_addr, dev->addr.addr_bytes, sizeof(conf->mac_addr)); + if (dev->type == PORT_TYPE_GENERAL) { /* dpdk phy device */ rte_eth_dev_info_get(dev->id, &info); #if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0) @@ -83,17 +91,17 @@ static void kni_fill_conf(const struct netif_port *dev, const char *ifname, } static int kni_mc_list_cmp_set(struct netif_port *dev, - struct ether_addr *addrs, size_t naddr) + struct rte_ether_addr *addrs, size_t naddr) { int err = EDPVS_INVAL, i, j; - struct ether_addr addrs_old[NETIF_MAX_HWADDR]; + struct rte_ether_addr addrs_old[NETIF_MAX_HWADDR]; size_t naddr_old; char mac[64]; struct mc_change_list { - size_t naddr; - struct ether_addr addrs[NETIF_MAX_HWADDR*2]; + size_t naddr; + struct rte_ether_addr addrs[NETIF_MAX_HWADDR*2]; /* state: 0 - unchanged, 1 - added, 2 deleted. */ - int states[NETIF_MAX_HWADDR*2]; + int states[NETIF_MAX_HWADDR*2]; } chg_lst = {0}; rte_rwlock_write_lock(&dev->dev_lock); @@ -116,7 +124,7 @@ static int kni_mc_list_cmp_set(struct netif_port *dev, /* add all addrs from netlink(linux) to change-list and * assume they're all new added by default. 
      */
     for (i = 0; i < naddr; i++) {
-        ether_addr_copy(&addrs[i], &chg_lst.addrs[i]);
+        rte_ether_addr_copy(&addrs[i], &chg_lst.addrs[i]);
         chg_lst.states[i] = 1;

         RTE_LOG(DEBUG, Kni, "    new [%02d] %s\n", i,
@@ -140,7 +148,7 @@ static int kni_mc_list_cmp_set(struct netif_port *dev,
             /* deleted */
             assert(chg_lst.naddr < NETIF_MAX_HWADDR * 2);
-            ether_addr_copy(&addrs_old[i], &chg_lst.addrs[chg_lst.naddr]);
+            rte_ether_addr_copy(&addrs_old[i], &chg_lst.addrs[chg_lst.naddr]);
             chg_lst.states[chg_lst.naddr] = 2;
             chg_lst.naddr++;
         }
@@ -188,7 +196,7 @@ static int kni_update_maddr(struct netif_port *dev)
     char line[1024];
     int ifindex, users, st; /* @st for static */
     char ifname[IFNAMSIZ], hexa[256]; /* hex address */
-    struct ether_addr ma_list[NETIF_MAX_HWADDR];
+    struct rte_ether_addr ma_list[NETIF_MAX_HWADDR];
     int n_ma;

     fp = fopen("/proc/net/dev_mcast", "r");
@@ -361,20 +369,6 @@ int kni_add_dev(struct netif_port *dev, const char *kniname)
         return err;
     }

-    /*
-     * kni device should use same mac as real device,
-     * because it may config same IP of real device.
-     * diff mac means kni cannot accept packets sent
-     * to real-device.
-     */
-    err = linux_set_if_mac(conf.name, (unsigned char *)&dev->addr);
-    if (err != EDPVS_OK) {
-        char mac[18];
-        ether_format_addr(mac, sizeof(mac), &dev->addr);
-        RTE_LOG(WARNING, Kni, "%s: fail to set mac %s for %s: %s\n",
-                __func__, mac, conf.name, strerror(errno));
-    }
-
     snprintf(ring_name, sizeof(ring_name), "kni_rx_ring_%s",
              conf.name);
     rb = rte_ring_create(ring_name, KNI_DEF_MBUF_SIZE,
diff --git a/src/log.c b/src/log.c
index 6e45964e7..e0b9dd2d7 100644
--- a/src/log.c
+++ b/src/log.c
@@ -34,7 +34,6 @@ lcoreid_t g_dpvs_log_core = 0;
 log_stats_t log_stats_info[DPVS_MAX_LCORE];
 struct rte_ring *log_ring;
 bool g_dpvs_log_async_mode = 0;
-extern struct rte_logs rte_logs;
 static struct rte_mempool *dp_vs_log_pool;
 static int log_pool_size = DPVS_LOG_POOL_SIZE_DEF;
 static int log_pool_cache = DPVS_LOG_CACHE_SIZE_DEF;
@@ -194,7 +193,7 @@ int dpvs_log(uint32_t level, uint32_t logtype, const char *func, int line, const
     int len = 0;
     int off = g_dpvs_log_time_off;

-    if (level > rte_logs.level)
+    if (level > rte_log_get_global_level())
         return -1;

     va_start(ap, format);
@@ -261,7 +260,7 @@ static int log_slave_process(void)
 {
     struct dpvs_log *msg_log;
     int ret = EDPVS_OK;
-    FILE *f = rte_logs.file;
+    FILE *f = rte_log_get_stream();

     /* dequeue LOG from ring, no lock for ring and w_buf */
     while (0 == rte_ring_dequeue(log_ring, (void **)&msg_log)) {
@@ -297,7 +296,7 @@ static void log_signal_handler(int signum)
                 signum);
     }
     log_slave_process();
-    log_buf_flush(rte_logs.file);
+    log_buf_flush(rte_log_get_stream());
     signal(signum, SIG_DFL);
     kill(getpid(), signum);
 }
@@ -306,14 +305,14 @@ static int __log_slave_init(void)
 {
     char ring_name[16];
     int lcore_id;
-    FILE *f = rte_logs.file;
+    FILE *f = rte_log_get_stream();
     char log_pool_name[32];

     if (f != NULL) {
         g_dpvs_log_time_off = LOG_SYS_TIME_LEN;
     }

-    RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+    RTE_LCORE_FOREACH_WORKER(lcore_id) {
         if (rte_eal_get_lcore_state(lcore_id) == FINISHED) {
             rte_eal_wait_lcore(lcore_id);
             dpvs_log_thread_lcore_set(lcore_id);
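Note: DPDK 20.11 renamed the lcore APIs — the "master" lcore became the "main" lcore and "slave" lcores became "worker" lcores — which is what the `RTE_LCORE_FOREACH_SLAVE`/`rte_get_master_lcore` replacements here (and later in mempool.c and netif.c) track. A minimal usage sketch of the renamed APIs, for orientation only; it is not part of the patch and `worker_loop`/`launch_all_workers` are made-up names:

```c
#include <stdio.h>
#include <rte_lcore.h>
#include <rte_launch.h>

/* hypothetical per-worker job, for illustration only */
static int worker_loop(void *arg)
{
    (void)arg;
    printf("lcore %u running\n", rte_lcore_id());
    return 0;
}

static void launch_all_workers(void)
{
    unsigned int cid;

    /* before dpdk-20.11 this was RTE_LCORE_FOREACH_SLAVE() */
    RTE_LCORE_FOREACH_WORKER(cid)
        rte_eal_remote_launch(worker_loop, NULL, cid);

    /* before dpdk-20.11 this was rte_get_master_lcore() */
    printf("main lcore is %u\n", rte_get_main_lcore());

    rte_eal_mp_wait_lcore();
}
```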
" + "Try old releases if you are using earlier dpdk versions."); +#endif +} /* * the initialization order of all the modules @@ -68,6 +75,8 @@ extern int log_slave_init(void); dpvs_scheduler_init, dpvs_scheduler_term), \ DPVS_MODULE(MODULE_GLOBAL_DATA, "global data", \ global_data_init, global_data_term), \ + DPVS_MODULE(MODULE_MBUF, "mbuf", \ + mbuf_init, NULL), \ DPVS_MODULE(MODULE_CFG, "config file", \ cfgfile_init, cfgfile_term), \ DPVS_MODULE(MODULE_PDUMP, "pdump", \ @@ -251,6 +260,8 @@ int main(int argc, char *argv[]) char pql_conf_buf[LCORE_CONF_BUFFER_LEN]; int pql_conf_buf_len = LCORE_CONF_BUFFER_LEN; + dpdk_version_check(); + /** * add application agruments parse before EAL ones. * use it like the following: diff --git a/src/mbuf.c b/src/mbuf.c index a2f1b8e3a..444082e2d 100644 --- a/src/mbuf.c +++ b/src/mbuf.c @@ -21,6 +21,7 @@ * it includes some mbuf related functions beyond dpdk mbuf API. */ #include +#include #include "mbuf.h" #include "inet.h" #include "ipv4.h" @@ -29,6 +30,19 @@ #define EMBUF #define RTE_LOGTYPE_EMBUF RTE_LOGTYPE_USER1 +#define MBUF_DYNFIELDS_MAX 8 +static int mbuf_dynfields_offset[MBUF_DYNFIELDS_MAX]; + +void *mbuf_userdata(struct rte_mbuf *mbuf, mbuf_usedata_field_t field) +{ + return (void *)mbuf + mbuf_dynfields_offset[field]; +} + +void *mbuf_userdata_const(const struct rte_mbuf *mbuf, mbuf_usedata_field_t field) +{ + return (void *)mbuf + mbuf_dynfields_offset[field]; +} + /** * mbuf_may_pull - pull bits from segments to heading mbuf if needed. * see pskb_may_pull() && __pskb_pull_tail(). @@ -107,7 +121,7 @@ void mbuf_copy_metadata(struct rte_mbuf *mi, struct rte_mbuf *m) mi->nb_segs = 1; mi->ol_flags = m->ol_flags & (~IND_ATTACHED_MBUF); mi->packet_type = m->packet_type; - mi->userdata = NULL; + mbuf_userdata_reset(mi); __rte_mbuf_sanity_check(mi, 1); __rte_mbuf_sanity_check(m, 0); @@ -153,7 +167,7 @@ inline void dp_vs_mbuf_dump(const char *msg, int af, const struct rte_mbuf *mbuf { char stime[SYS_TIME_STR_LEN]; char sbuf[64], dbuf[64]; - struct ipv4_hdr *iph; + struct rte_ipv4_hdr *iph; union inet_addr saddr, daddr; __be16 _ports[2], *ports; @@ -176,3 +190,34 @@ inline void dp_vs_mbuf_dump(const char *msg, int af, const struct rte_mbuf *mbuf ntohs(ports[1])); } #endif + +int mbuf_init(void) +{ + int i, offset; + + const struct rte_mbuf_dynfield rte_mbuf_userdata_fields[] = { + [ MBUF_FIELD_PROTO ] = { + .name = "protocol", + .size = sizeof(mbuf_userdata_field_proto_t), + .align = 8, + }, + [ MBUF_FIELD_ROUTE ] = { + .name = "route", + .size = sizeof(mbuf_userdata_field_route_t), + .align = 8, + }, + }; + + for (i = 0; i < NELEMS(rte_mbuf_userdata_fields); i++) { + if (rte_mbuf_userdata_fields[i].size == 0) + continue; + offset = rte_mbuf_dynfield_register(&rte_mbuf_userdata_fields[i]); + if (offset < 0) { + RTE_LOG(ERR, MBUF, "fail to register dynfield[%d] in mbuf!\n", i); + return EDPVS_NOROOM; + } + mbuf_dynfields_offset[i] = offset; + } + + return EDPVS_OK; +} diff --git a/src/mempool.c b/src/mempool.c index e3d5bdac5..ca852b4bc 100644 --- a/src/mempool.c +++ b/src/mempool.c @@ -88,7 +88,7 @@ struct dpvs_mempool *dpvs_mempool_create(char *name, uint32_t obj_num; struct dpvs_mempool *mp; - if (rte_lcore_id() != rte_get_master_lcore()) { + if (rte_lcore_id() != rte_get_main_lcore()) { RTE_LOG(WARNING, DPVS_MPOOL, "%s could be called on master lcore only!", __func__); return NULL; } @@ -152,7 +152,7 @@ void dpvs_mempool_destroy(struct dpvs_mempool *mp) if (unlikely(!mp)) return; - if (rte_lcore_id() != rte_get_master_lcore()) { + if (rte_lcore_id() 
diff --git a/src/mempool.c b/src/mempool.c
index e3d5bdac5..ca852b4bc 100644
--- a/src/mempool.c
+++ b/src/mempool.c
@@ -88,7 +88,7 @@ struct dpvs_mempool *dpvs_mempool_create(char *name,
     uint32_t obj_num;
     struct dpvs_mempool *mp;

-    if (rte_lcore_id() != rte_get_master_lcore()) {
+    if (rte_lcore_id() != rte_get_main_lcore()) {
         RTE_LOG(WARNING, DPVS_MPOOL, "%s could be called on master lcore only!", __func__);
         return NULL;
     }
@@ -152,7 +152,7 @@ void dpvs_mempool_destroy(struct dpvs_mempool *mp)
     if (unlikely(!mp))
         return;
-    if (rte_lcore_id() != rte_get_master_lcore()) {
+    if (rte_lcore_id() != rte_get_main_lcore()) {
         RTE_LOG(WARNING, DPVS_MPOOL, "%s could be called on master lcore only!", __func__);
         return;
     }
 }
diff --git a/src/neigh.c b/src/neigh.c
index 35130e8bb..80341b763 100644
--- a/src/neigh.c
+++ b/src/neigh.c
@@ -51,12 +51,12 @@ struct neighbour_mbuf_entry {
 } __rte_cache_aligned;

 struct raw_neigh {
-    int af;
-    union inet_addr ip_addr;
-    struct ether_addr eth_addr;
-    struct netif_port *port;
-    bool add;
-    uint8_t flag;
+    int                   af;
+    union inet_addr       ip_addr;
+    struct rte_ether_addr eth_addr;
+    struct netif_port     *port;
+    bool                  add;
+    uint8_t               flag;
 } __rte_cache_aligned;

 struct nud_state {
@@ -147,12 +147,12 @@ static lcoreid_t master_cid = 0;

 static struct list_head neigh_table[DPVS_MAX_LCORE][NEIGH_TAB_SIZE];

-static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* neighbour,
+static struct raw_neigh *neigh_ring_clone_entry(const struct neighbour_entry *neighbour,
                                                 bool add);

 static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst_ip);

-static inline char *eth_addr_itoa(const struct ether_addr *src, char *dst, size_t size)
+static inline char *eth_addr_itoa(const struct rte_ether_addr *src, char *dst, size_t size)
 {
     snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x",
              src->addr_bytes[0],
@@ -165,18 +165,18 @@ static inline char *eth_addr_itoa(const struct ether_addr *src, char *dst, size_
 }

 #ifdef CONFIG_DPVS_NEIGH_DEBUG
-static void dump_arp_hdr(const char *msg, const struct arp_hdr *ah, portid_t port)
+static void dump_arp_hdr(const char *msg, const struct rte_arp_hdr *ah, portid_t port)
 {
-    const struct arp_ipv4 *aip4;
+    const struct rte_arp_ipv4 *aip4;
     char sha[18], tha[18];
     char sip[16], tip[16];
     lcoreid_t lcore;

     lcore = rte_lcore_id();
     fprintf(stderr, "%s lcore %d port%d arp hlen %u plen %u op %u",
-            msg ?
msg : "", lcore, port, ah->arp_hlen, ah->arp_plen, ntohs(ah->arp_opcode)); - if (ah->arp_pro == htons(ETHER_TYPE_IPv4)) { + if (ah->arp_protocol == htons(RTE_ETHER_TYPE_IPV4)) { aip4 = &ah->arp_data; eth_addr_itoa(&aip4->arp_sha, sha, sizeof(sha)); eth_addr_itoa(&aip4->arp_tha, tha, sizeof(tha)); @@ -245,7 +245,7 @@ static inline int neigh_unhash(struct neighbour_entry *neighbour) } static inline bool neigh_key_cmp(int af, const struct neighbour_entry *neighbour, - const union inet_addr *key, const struct netif_port* port) + const union inet_addr *key, const struct netif_port *port) { return (inet_addr_equal(af, key, &neighbour->ip_addr)) && @@ -333,7 +333,7 @@ static int neighbour_timer_event(void *data) } struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, - const struct netif_port* port, + const struct netif_port *port, unsigned int hashkey) { struct neighbour_entry *neighbour; @@ -347,7 +347,7 @@ struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, return NULL; } -int neigh_edit(struct neighbour_entry *neighbour, struct ether_addr *eth_addr) +int neigh_edit(struct neighbour_entry *neighbour, struct rte_ether_addr *eth_addr) { rte_memcpy(&neighbour->eth_addr, eth_addr, 6); @@ -355,7 +355,7 @@ int neigh_edit(struct neighbour_entry *neighbour, struct ether_addr *eth_addr) } struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, - const struct ether_addr *eth_addr, + const struct rte_ether_addr *eth_addr, struct netif_port *port, unsigned int hashkey, int flag) { @@ -415,21 +415,21 @@ static void neigh_fill_mac(struct neighbour_entry *neighbour, const struct in6_addr *target, struct netif_port *port) { - struct ether_hdr *eth; - struct ether_addr mult_eth; + struct rte_ether_hdr *eth; + struct rte_ether_addr mult_eth; uint16_t pkt_type; - m->l2_len = sizeof(struct ether_hdr); - eth = (struct ether_hdr *)rte_pktmbuf_prepend(m, (uint16_t)sizeof(struct ether_hdr)); + m->l2_len = sizeof(struct rte_ether_hdr); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(m, (uint16_t)sizeof(struct rte_ether_hdr)); if (!neighbour && target) { ipv6_mac_mult(target, &mult_eth); - ether_addr_copy(&mult_eth, ð->d_addr); + rte_ether_addr_copy(&mult_eth, ð->d_addr); } else { - ether_addr_copy(&neighbour->eth_addr, ð->d_addr); + rte_ether_addr_copy(&neighbour->eth_addr, ð->d_addr); } - ether_addr_copy(&port->addr, ð->s_addr); + rte_ether_addr_copy(&port->addr, ð->s_addr); pkt_type = (uint16_t)m->packet_type; eth->ether_type = rte_cpu_to_be_16(pkt_type); } @@ -494,39 +494,39 @@ static void neigh_state_confirm(struct neighbour_entry *neighbour) int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port) { - struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *); - struct ether_hdr *eth; + struct rte_arp_hdr *arp = rte_pktmbuf_mtod(m, struct rte_arp_hdr *); + struct rte_ether_hdr *eth; uint32_t ipaddr; struct neighbour_entry *neighbour = NULL; unsigned int hashkey; struct inet_ifaddr *ifa; - ifa = inet_addr_ifa_get(AF_INET, port, (union inet_addr*)&arp->arp_data.arp_tip); + ifa = inet_addr_ifa_get(AF_INET, port, (union inet_addr *)&arp->arp_data.arp_tip); if (!ifa) return EDPVS_KNICONTINUE; inet_addr_ifa_put(ifa); - eth = (struct ether_hdr *)rte_pktmbuf_prepend(m, - (uint16_t)sizeof(struct ether_hdr)); + eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(m, + (uint16_t)sizeof(struct rte_ether_hdr)); - if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REQUEST) { - ether_addr_copy(ð->s_addr, ð->d_addr); + if 
(rte_be_to_cpu_16(arp->arp_opcode) == RTE_ARP_OP_REQUEST) { + rte_ether_addr_copy(ð->s_addr, ð->d_addr); rte_memcpy(ð->s_addr, &port->addr, 6); - arp->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY); + arp->arp_opcode = rte_cpu_to_be_16(RTE_ARP_OP_REPLY); - ether_addr_copy(&arp->arp_data.arp_sha, &arp->arp_data.arp_tha); - ether_addr_copy(ð->s_addr, &arp->arp_data.arp_sha); + rte_ether_addr_copy(&arp->arp_data.arp_sha, &arp->arp_data.arp_tha); + rte_ether_addr_copy(ð->s_addr, &arp->arp_data.arp_sha); ipaddr = arp->arp_data.arp_sip; arp->arp_data.arp_sip = arp->arp_data.arp_tip; arp->arp_data.arp_tip = ipaddr; - m->l2_len = sizeof(struct ether_hdr); - m->l3_len = sizeof(struct arp_hdr); + m->l2_len = sizeof(struct rte_ether_hdr); + m->l3_len = sizeof(struct rte_arp_hdr); netif_xmit(m, port); return EDPVS_OK; - } else if (arp->arp_op == htons(ARP_OP_REPLY)) { + } else if (arp->arp_opcode == htons(RTE_ARP_OP_REPLY)) { ipaddr = arp->arp_data.arp_sip; hashkey = neigh_hashkey(AF_INET, (union inet_addr *)&ipaddr, port); neighbour = neigh_lookup_entry(AF_INET, (union inet_addr *)&ipaddr, @@ -554,8 +554,8 @@ int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port) static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst_ip) { struct rte_mbuf *m; - struct ether_hdr *eth; - struct arp_hdr *arp; + struct rte_ether_hdr *eth; + struct rte_arp_hdr *arp; uint32_t addr; @@ -563,16 +563,16 @@ static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst if (unlikely(m == NULL)) { return EDPVS_NOMEM; } - m->userdata = NULL; + mbuf_userdata_reset(m); - eth = rte_pktmbuf_mtod(m, struct ether_hdr *); - arp = (struct arp_hdr *)ð[1]; + eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); + arp = (struct rte_arp_hdr *)ð[1]; memset(ð->d_addr, 0xFF, 6); - ether_addr_copy(&port->addr, ð->s_addr); - eth->ether_type = htons(ETHER_TYPE_ARP); + rte_ether_addr_copy(&port->addr, ð->s_addr); + eth->ether_type = htons(RTE_ETHER_TYPE_ARP); - memset(arp, 0, sizeof(struct arp_hdr)); + memset(arp, 0, sizeof(struct rte_arp_hdr)); rte_memcpy(&arp->arp_data.arp_sha, &port->addr, 6); addr = src_ip; inetAddrCopy(&arp->arp_data.arp_sip, &addr); @@ -581,15 +581,15 @@ static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst addr = dst_ip; inetAddrCopy(&arp->arp_data.arp_tip, &addr); - arp->arp_hrd = htons(ARP_HRD_ETHER); - arp->arp_pro = htons(ETHER_TYPE_IPv4); - arp->arp_hln = 6; - arp->arp_pln = 4; - arp->arp_op = htons(ARP_OP_REQUEST); - m->pkt_len = 60; - m->data_len = 60; - m->l2_len = sizeof(struct ether_hdr); - m->l3_len = sizeof(struct arp_hdr); + arp->arp_hardware = htons(RTE_ARP_HRD_ETHER); + arp->arp_protocol = htons(RTE_ETHER_TYPE_IPV4); + arp->arp_hlen = 6; + arp->arp_plen = 4; + arp->arp_opcode = htons(RTE_ARP_OP_REQUEST); + m->pkt_len = 60; + m->data_len = 60; + m->l2_len = sizeof(struct rte_ether_hdr); + m->l3_len = sizeof(struct rte_arp_hdr); memset(&arp[1], 0, 18); @@ -735,7 +735,7 @@ int neigh_gratuitous_arp(struct in_addr *src_ip, struct netif_port *port) } static struct pkt_type arp_pkt_type = { - //.type = rte_cpu_to_be_16(ETHER_TYPE_ARP), + //.type = rte_cpu_to_be_16(RTE_ETHER_TYPE_ARP), .func = neigh_resolve_input, .port = NULL, }; @@ -760,10 +760,10 @@ static int neigh_ring_init(void) return EDPVS_OK; } -static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* neighbour, +static struct raw_neigh *neigh_ring_clone_entry(const struct neighbour_entry *neighbour, bool add) { - struct raw_neigh* mac_param; + struct raw_neigh 
*mac_param; mac_param = dpvs_mempool_get(neigh_mempool, sizeof(struct raw_neigh)); if (unlikely(mac_param == NULL)) @@ -779,11 +779,11 @@ static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* ne return mac_param; } -static struct raw_neigh* neigh_ring_clone_param(const struct dp_vs_neigh_conf *param, +static struct raw_neigh *neigh_ring_clone_param(const struct dp_vs_neigh_conf *param, bool add) { struct netif_port *port; - struct raw_neigh* mac_param; + struct raw_neigh *mac_param; mac_param = dpvs_mempool_get(neigh_mempool, sizeof(struct raw_neigh)); if (unlikely(mac_param == NULL)) @@ -874,7 +874,7 @@ static void neigh_fill_param(struct dp_vs_neigh_conf *param, param->af = entry->af; param->ip_addr = entry->ip_addr; param->flag = entry->flag; - ether_addr_copy(&entry->eth_addr, ¶m->eth_addr); + rte_ether_addr_copy(&entry->eth_addr, ¶m->eth_addr); param->que_num = entry->que_num; param->state = entry->state; param->cid = cid; @@ -1132,7 +1132,7 @@ static int arp_init(void) master_cid = rte_lcore_id(); - arp_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_ARP); + arp_pkt_type.type = rte_cpu_to_be_16(RTE_ETHER_TYPE_ARP); if ((err = netif_register_pkt(&arp_pkt_type)) != EDPVS_OK) return err; if ((err = sockopt_register(&neigh_sockopts)) != EDPVS_OK) diff --git a/src/netif.c b/src/netif.c index b77cb4ef3..80d9babdd 100644 --- a/src/netif.c +++ b/src/netif.c @@ -38,6 +38,7 @@ #include "parser/parser.h" #include "neigh.h" #include "scheduler.h" +#include "netif_flow.h" #include #include @@ -84,7 +85,8 @@ static uint16_t g_nports; /*for arp process*/ static struct rte_ring *arp_ring[DPVS_MAX_LCORE]; -#define NETIF_BOND_MODE_DEF BONDING_MODE_ROUND_ROBIN +#define NETIF_BOND_MODE_DEF BONDING_MODE_ROUND_ROBIN +#define NETIF_BOND_NUMA_NODE_DEF 0 struct port_conf_stream { int port_id; @@ -99,22 +101,24 @@ struct port_conf_stream { int tx_queue_nb; int tx_desc_nb; - enum rte_fdir_mode fdir_mode; - enum rte_fdir_pballoc_type fdir_pballoc; - enum rte_fdir_status_mode fdir_status; - bool promisc_mode; struct list_head port_list_node; }; +struct bond_options { + bool dedicated_queues_enable; +}; + struct bond_conf_stream { int port_id; char name[32]; char kni_name[32]; int mode; + int numa_node; char primary[32]; char slaves[NETIF_MAX_BOND_SLAVES][32]; + struct bond_options options; struct list_head bond_list_node; }; @@ -153,14 +157,12 @@ static struct list_head port_ntab[NETIF_PORT_TABLE_BUCKETS]; /* hashed by name * /* function declarations */ static void kni_lcore_loop(void *dummy); -bool dp_vs_fdir_filter_enable = true; - bool is_lcore_id_valid(lcoreid_t cid) { if (unlikely(cid >= DPVS_MAX_LCORE)) return false; - return ((cid == rte_get_master_lcore()) || + return ((cid == rte_get_main_lcore()) || (cid == g_kni_lcore_id) || (g_slave_lcore_mask & (1L << cid)) || (g_isol_rx_lcore_mask & (1L << cid))); @@ -171,7 +173,7 @@ static bool is_lcore_id_fwd(lcoreid_t cid) if (unlikely(cid >= DPVS_MAX_LCORE)) return false; - return ((cid == rte_get_master_lcore()) || + return ((cid == rte_get_main_lcore()) || (g_slave_lcore_mask & (1L << cid))); } @@ -245,6 +247,31 @@ static void pktpool_cache_handler(vector_t tokens) FREE_PTR(str); } +#ifdef CONFIG_DPVS_FDIR +static enum rte_fdir_mode g_fdir_mode = RTE_FDIR_MODE_PERFECT; + +static void fdir_mode_handler(vector_t tokens) +{ + char *mode, *str = set_value(tokens); + + assert(str); + mode = strlwr(str); + + if (!strncmp(mode, "perfect", sizeof("perfect"))) + g_fdir_mode = RTE_FDIR_MODE_PERFECT; + else if (!strncmp(mode, "signature", 
sizeof("signature"))) + g_fdir_mode = RTE_FDIR_MODE_SIGNATURE; + else { + RTE_LOG(WARNING, NETIF, "invalid fdir_mode %s, using default %s\n", + mode, "perfect"); + g_fdir_mode = RTE_FDIR_MODE_PERFECT; + } + RTE_LOG(INFO, NETIF, "g_fdir_mode = %s\n", mode); + + FREE_PTR(str); +} +#endif + static void device_handler(vector_t tokens) { assert(VECTOR_SIZE(tokens) >= 1); @@ -269,9 +296,6 @@ static void device_handler(vector_t tokens) port_cfg->promisc_mode = false; strncpy(port_cfg->rss, "tcp", sizeof(port_cfg->rss)); - port_cfg->fdir_mode = RTE_FDIR_MODE_PERFECT; - port_cfg->fdir_pballoc = RTE_FDIR_PBALLOC_64K; - port_cfg->fdir_status = RTE_FDIR_REPORT_STATUS; list_add(&port_cfg->port_list_node, &port_list); } @@ -387,120 +411,6 @@ static void tx_desc_nb_handler(vector_t tokens) FREE_PTR(str); } -static void fdir_mode_handler(vector_t tokens) -{ - char *mode, *str = set_value(tokens); - struct port_conf_stream *current_device = list_entry(port_list.next, - struct port_conf_stream, port_list_node); - bool use_default = false; - assert(str); - - mode = strlwr(str); - - if (!strncmp(mode, "none", sizeof("none"))) - current_device->fdir_mode = RTE_FDIR_MODE_NONE; - else if (!strncmp(mode, "signature", sizeof("signature"))) - current_device->fdir_mode = RTE_FDIR_MODE_SIGNATURE; - else if (!strncmp(mode, "perfect", sizeof("perfect"))) - current_device->fdir_mode = RTE_FDIR_MODE_PERFECT; - else if (!strncmp(mode, "perfect_mac_vlan", sizeof("perfect_mac_vlan"))) - current_device->fdir_mode = RTE_FDIR_MODE_PERFECT_MAC_VLAN; - else if (!strncmp(mode, "perfect_tunnel", sizeof("perfect_tunnel"))) - current_device->fdir_mode = RTE_FDIR_MODE_PERFECT_TUNNEL; - else { - use_default = true; - current_device->fdir_mode = RTE_FDIR_MODE_PERFECT; - } - - if (use_default) - RTE_LOG(WARNING, NETIF, "invalid %s:fdir_mode '%s', " - "use default 'perfect'\n", current_device->name, mode); - else - RTE_LOG(INFO, NETIF, "%s:fdir_mode = %s\n", current_device->name, mode); - - FREE_PTR(str); -} - -static void fdir_pballoc_handler(vector_t tokens) -{ - char *pballoc, *str = set_value(tokens); - struct port_conf_stream *current_device = list_entry(port_list.next, - struct port_conf_stream, port_list_node); - bool use_default = false; - assert(str); - - pballoc = strlwr(str); - - if (!strncmp(pballoc, "64k", sizeof("64k"))) - current_device->fdir_pballoc = RTE_FDIR_PBALLOC_64K; - else if (!strncmp(pballoc, "128k", sizeof("128k"))) - current_device->fdir_pballoc = RTE_FDIR_PBALLOC_128K; - else if (!strncmp(pballoc, "256k", sizeof("256k"))) - current_device->fdir_pballoc = RTE_FDIR_PBALLOC_256K; - else { - use_default = true; - current_device->fdir_pballoc = RTE_FDIR_PBALLOC_64K; - } - - if (use_default) - RTE_LOG(WARNING, NETIF, "invalid %s:fdir_pballoc '%s', " - "use default '64k'\n", current_device->name, pballoc); - else - RTE_LOG(INFO, NETIF, "%s:fdir_pballoc = %s\n", - current_device->name, pballoc); - - FREE_PTR(str); -} - -static void fdir_status_handler(vector_t tokens) -{ - char *status, *str = set_value(tokens); - struct port_conf_stream *current_device = list_entry(port_list.next, - struct port_conf_stream, port_list_node); - bool use_default = false; - assert(str); - - status = strlwr(str); - - if (!strncmp(status, "close", sizeof("close"))) - current_device->fdir_status = RTE_FDIR_NO_REPORT_STATUS; - else if (!strncmp(status, "matched", sizeof("matched"))) - current_device->fdir_status = RTE_FDIR_REPORT_STATUS; - else if (!strncmp(status, "always", sizeof("always"))) - current_device->fdir_status = 
RTE_FDIR_REPORT_STATUS_ALWAYS; - else { - use_default = true; - current_device->fdir_status = RTE_FDIR_REPORT_STATUS; - } - - if (use_default) - RTE_LOG(WARNING, NETIF, "invalid %s:fdir_status '%s', " - "use default 'matched'\n", current_device->name, status); - else - RTE_LOG(INFO, NETIF, "%s:fdir_status = %s\n", - current_device->name, status); - - FREE_PTR(str); -} - -static void fdir_filter_handler(vector_t tokens) -{ - char *str = set_value(tokens); - - assert(str); - - if (strcasecmp(str, "on") == 0) - dp_vs_fdir_filter_enable = true; - else if (strcasecmp(str, "off") == 0) - dp_vs_fdir_filter_enable = false; - else - RTE_LOG(WARNING, IPVS, "invalid fdir:filter %s\n", str); - - RTE_LOG(INFO, IPVS, "fdir:filter = %s\n", dp_vs_fdir_filter_enable ? "on" : "off"); - - FREE_PTR(str); -} - static void promisc_mode_handler(vector_t tokens) { struct port_conf_stream *current_device = list_entry(port_list.next, @@ -537,7 +447,7 @@ static void kni_name_handler(vector_t tokens) struct port_conf_stream, port_list_node); assert(str); - RTE_LOG(INFO, NETIF, "%s: kni_name = %s\n",current_device->name, str); + RTE_LOG(INFO, NETIF, "%s:kni_name = %s\n",current_device->name, str); strncpy(current_device->kni_name, str, sizeof(current_device->kni_name)); FREE_PTR(str); @@ -560,6 +470,8 @@ static void bonding_handler(vector_t tokens) RTE_LOG(INFO, NETIF, "netif bonding config: %s\n", str); strncpy(bond_cfg->name, str, sizeof(bond_cfg->name)); bond_cfg->mode = NETIF_BOND_MODE_DEF; + bond_cfg->numa_node = NETIF_BOND_NUMA_NODE_DEF; + bond_cfg->options.dedicated_queues_enable = true; list_add(&bond_cfg->bond_list_node, &bond_list); } @@ -635,6 +547,27 @@ static void bonding_primary_handler(vector_t tokens) FREE_PTR(str); } +static void bonding_numa_node_handler(vector_t tokens) +{ + char *str = set_value(tokens); + int numa_node; + struct bond_conf_stream *current_bond = list_entry(bond_list.next, + struct bond_conf_stream, bond_list_node); + assert(str); + + numa_node = atoi(str); + if (numa_node >= get_numa_nodes()) { + RTE_LOG(WARNING, NETIF, "invalid bonding %s:numa_node %d, using default %d\n", + current_bond->name, numa_node, NETIF_BOND_NUMA_NODE_DEF); + current_bond->mode = NETIF_BOND_NUMA_NODE_DEF; + } else { + RTE_LOG(INFO, NETIF, "bonding %s:numa_node=%d\n", current_bond->name, numa_node); + current_bond->numa_node = numa_node; + } + + FREE_PTR(str); +} + static void bonding_kni_name_handler(vector_t tokens) { char *str = set_value(tokens); @@ -648,6 +581,59 @@ static void bonding_kni_name_handler(vector_t tokens) FREE_PTR(str); } +static inline char * get_bonding_option_value(char *token) +{ + char *ptr, *saveptr = NULL, *ret = token; + + if (!token) + return NULL; + + for (ptr = token; ret == token; ptr = NULL) + ret = strtok_r(ptr, "=", &saveptr); + + return ret; +} + +static void bonding_options_handler(vector_t tokens) +{ + char *str; + char *opt, *val, *ptr, *saveptr = NULL; + + str = set_value(tokens); + struct bond_conf_stream *current_bond = list_entry(bond_list.next, + struct bond_conf_stream, bond_list_node); + + assert(str); + RTE_LOG(INFO, NETIF, "bonding %s options: %s\n", current_bond->name, str); + + for (ptr = str; ;ptr = NULL) { + opt = strtok_r(ptr, ";", &saveptr); + if (opt == NULL) + break; + val = get_bonding_option_value(opt); + + if (!strcmp(opt, "dedicated_queues")) { + if (current_bond->mode != BONDING_MODE_8023AD || !val) { + RTE_LOG(WARNING, NETIF, "invalid bonding %s mode 4 option: %s, value: %s\n", + current_bond->name, opt, val ?: "null"); + continue; + } + if 
(!strcasecmp(val, "on") || !strcasecmp(val, "enable"))
+                current_bond->options.dedicated_queues_enable = true;
+            else if (!strcasecmp(val, "off") || !strcasecmp(val, "disable"))
+                current_bond->options.dedicated_queues_enable = false;
+            else
+                RTE_LOG(WARNING, NETIF, "invalid bonding %s option value: %s=%s\n",
+                        current_bond->name, opt, val);
+        } else {
+            RTE_LOG(WARNING, NETIF, "unsupported bonding %s option: %s\n",
+                    current_bond->name, opt);
+        }
+    }
+
+    FREE_PTR(str);
+}
+
 static void worker_defs_handler(vector_t tokens)
 {
     struct worker_conf_stream *worker_cfg, *worker_cfg_next;
@@ -880,6 +866,9 @@ void netif_keyword_value_init(void)
         /* KW_TYPE_INIT keyword */
         netif_pktpool_nb_mbuf = NETIF_PKTPOOL_NB_MBUF_DEF;
         netif_pktpool_mbuf_cache = NETIF_PKTPOOL_MBUF_CACHE_DEF;
+#ifdef CONFIG_DPVS_FDIR
+        g_fdir_mode = RTE_FDIR_MODE_PERFECT;
+#endif
     }
     /* KW_TYPE_NORMAL keyword */
 }
@@ -889,6 +878,9 @@ void install_netif_keywords(void)
     install_keyword_root("netif_defs", netif_defs_handler);
     install_keyword("pktpool_size", pktpool_size_handler, KW_TYPE_INIT);
     install_keyword("pktpool_cache", pktpool_cache_handler, KW_TYPE_INIT);
+#ifdef CONFIG_DPVS_FDIR
+    install_keyword("fdir_mode", fdir_mode_handler, KW_TYPE_INIT);
+#endif
     install_keyword("device", device_handler, KW_TYPE_INIT);
     install_sublevel();
     install_keyword("rx", NULL, KW_TYPE_INIT);
@@ -902,13 +894,6 @@
     install_keyword("queue_number", tx_queue_number_handler, KW_TYPE_INIT);
     install_keyword("descriptor_number", tx_desc_nb_handler, KW_TYPE_INIT);
     install_sublevel_end();
-    install_keyword("fdir", NULL, KW_TYPE_INIT);
-    install_sublevel();
-    install_keyword("mode", fdir_mode_handler, KW_TYPE_INIT);
-    install_keyword("pballoc", fdir_pballoc_handler, KW_TYPE_INIT);
-    install_keyword("status", fdir_status_handler, KW_TYPE_INIT);
-    install_keyword("filter", fdir_filter_handler, KW_TYPE_INIT);
-    install_sublevel_end();
     install_keyword("promisc_mode", promisc_mode_handler, KW_TYPE_INIT);
     install_keyword("mtu", custom_mtu_handler,KW_TYPE_INIT);
     install_keyword("kni_name", kni_name_handler, KW_TYPE_INIT);
@@ -918,7 +903,9 @@
     install_keyword("mode", bonding_mode_handler, KW_TYPE_INIT);
     install_keyword("slave", bonding_slave_handler, KW_TYPE_INIT);
     install_keyword("primary", bonding_primary_handler, KW_TYPE_INIT);
+    install_keyword("numa_node", bonding_numa_node_handler, KW_TYPE_INIT);
     install_keyword("kni_name", bonding_kni_name_handler, KW_TYPE_INIT);
+    install_keyword("options", bonding_options_handler, KW_TYPE_INIT);
     install_sublevel_end();

     install_keyword_root("worker_defs", worker_defs_handler);
@@ -981,24 +968,24 @@ static void netif_cfgfile_term(void)
 #include
 static inline int parse_ether_hdr(struct rte_mbuf *mbuf, uint16_t port, uint16_t queue)
 {
-    struct ether_hdr *eth_hdr;
+    struct rte_ether_hdr *eth_hdr;
     char saddr[18], daddr[18];
-    eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
-    ether_format_addr(saddr, sizeof(saddr), &eth_hdr->s_addr);
-    ether_format_addr(daddr, sizeof(daddr), &eth_hdr->d_addr);
+    eth_hdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
+    rte_ether_format_addr(saddr, sizeof(saddr), &eth_hdr->s_addr);
+    rte_ether_format_addr(daddr, sizeof(daddr), &eth_hdr->d_addr);
     RTE_LOG(INFO, NETIF, "[%s] lcore=%u port=%u queue=%u ethtype=%0x saddr=%s daddr=%s\n",
             __func__, rte_lcore_id(), port, queue,
             rte_be_to_cpu_16(eth_hdr->ether_type), saddr, daddr);
     return EDPVS_OK;
 }
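The `numa_node` and `options` keywords registered above extend the bonding block of dpvs.conf. A hypothetical fragment for orientation (interface names are illustrative, not from this patch); per `bonding_options_handler`, multiple options are separated by `;`, values may be `on`/`enable` or `off`/`disable`, and `dedicated_queues` is only honored in bonding mode 4 (802.3ad):

```
! illustrative netif_defs fragment using the new bonding keywords
<init> bonding bond0 {
    mode        4
    slave       dpdk0
    slave       dpdk1
    primary     dpdk0
    numa_node   0
    kni_name    bond0.kni
    options     dedicated_queues=on
}
```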
-static inline int is_ipv4_pkt_valid(struct ipv4_hdr *iph, uint32_t link_len)
+static inline int is_ipv4_pkt_valid(struct rte_ipv4_hdr *iph, uint32_t link_len)
 {
     if (((iph->version_ihl) >> 4) != 4)
         return EDPVS_INVAL;
     if ((iph->version_ihl & 0xf) < 5)
         return EDPVS_INVAL;
-    if (rte_cpu_to_be_16(iph->total_length) < sizeof(struct ipv4_hdr))
+    if (rte_cpu_to_be_16(iph->total_length) < sizeof(struct rte_ipv4_hdr))
         return EDPVS_INVAL;
     return EDPVS_OK;
 }
@@ -1007,14 +994,14 @@ static void parse_ipv4_hdr(struct rte_mbuf *mbuf, uint16_t port, uint16_t queue)
 {
     char saddr[16], daddr[16];
     uint16_t lcore;
-    struct ipv4_hdr *iph;
-    struct udp_hdr *uh;
+    struct rte_ipv4_hdr *iph;
+    struct rte_udp_hdr *uh;

-    iph = rte_pktmbuf_mtod_offset(mbuf, struct ipv4_hdr *, sizeof(struct ether_hdr));
+    iph = rte_pktmbuf_mtod_offset(mbuf, struct rte_ipv4_hdr *, sizeof(struct rte_ether_hdr));
     if (is_ipv4_pkt_valid(iph, mbuf->pkt_len) < 0)
         return;
-    uh = rte_pktmbuf_mtod_offset(mbuf, struct udp_hdr *, sizeof(struct ether_hdr) +
-            (IPV4_HDR_IHL_MASK & iph->version_ihl) * sizeof(uint32_t));
+    uh = rte_pktmbuf_mtod_offset(mbuf, struct rte_udp_hdr *, sizeof(struct rte_ether_hdr) +
+            (RTE_IPV4_HDR_IHL_MASK & iph->version_ihl) * sizeof(uint32_t));

     lcore = rte_lcore_id();
     if (!inet_ntop(AF_INET, &iph->src_addr, saddr, sizeof(saddr)))
@@ -1024,7 +1011,7 @@ static void parse_ipv4_hdr(struct rte_mbuf *mbuf, uint16_t port, uint16_t queue)
     RTE_LOG(INFO, NETIF, "[%s] lcore=%u port=%u queue=%u ipv4_hl=%u tos=%u tot=%u "
             "id=%u ttl=%u prot=%u src=%s dst=%s sport=%04x|%u dport=%04x|%u\n",
-            __func__, lcore, port, queue, IPV4_HDR_IHL_MASK & iph->version_ihl,
+            __func__, lcore, port, queue, RTE_IPV4_HDR_IHL_MASK & iph->version_ihl,
             iph->type_of_service, ntohs(iph->total_length), ntohs(iph->packet_id),
             iph->time_to_live, iph->next_proto_id, saddr, daddr,
             uh->src_port, ntohs(uh->src_port), uh->dst_port, ntohs(uh->dst_port));
@@ -1033,12 +1020,12 @@
 __rte_unused static void pkt_send_back(struct rte_mbuf *mbuf, struct netif_port *port)
 {
-    struct ether_hdr *ehdr;
-    struct ether_addr eaddr;
-    ehdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr*);
-    ether_addr_copy(&ehdr->s_addr, &eaddr);
-    ether_addr_copy(&ehdr->d_addr, &ehdr->s_addr);
-    ether_addr_copy(&eaddr, &ehdr->d_addr);
+    struct rte_ether_hdr *ehdr;
+    struct rte_ether_addr eaddr;
+    ehdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr*);
+    rte_ether_addr_copy(&ehdr->s_addr, &eaddr);
+    rte_ether_addr_copy(&ehdr->d_addr, &ehdr->s_addr);
+    rte_ether_addr_copy(&eaddr, &ehdr->d_addr);
     netif_xmit(mbuf, port);
 }
 #endif
@@ -1345,7 +1332,7 @@ static void build_lcore_index(void)
 {
     int i, idx = 0;

-    g_lcore_index[idx++] = rte_get_master_lcore();
+    g_lcore_index[idx++] = rte_get_main_lcore();

     for (i = 0; i < DPVS_MAX_LCORE; i++)
         if (g_lcore_role[i] == LCORE_ROLE_FWD_WORKER)
@@ -1367,7 +1354,7 @@ static void lcore_role_init(void)
         /* invalidate the disabled cores */
         g_lcore_role[cid] = LCORE_ROLE_MAX;

-    cid = rte_get_master_lcore();
+    cid = rte_get_main_lcore();
     assert(g_lcore_role[cid] == LCORE_ROLE_IDLE);
     g_lcore_role[cid] = LCORE_ROLE_MASTER;
@@ -1755,7 +1742,7 @@ static int build_port_queue_lcore_map(void)

         dev = netif_port_get(pid);
         if (dev) {
-            ether_format_addr(pql_map[pid].mac_addr,
+            rte_ether_format_addr(pql_map[pid].mac_addr,
                     sizeof(pql_map[pid].mac_addr), &dev->addr);
         }
     }
@@ -1915,7 +1902,7 @@ int netif_print_lcore_queue_conf(lcoreid_t cid, char *buf, int *len, bool has_ti
     if (unlikely(!buf || !len || *len <= 0))
         return EDPVS_INVAL;

-    if (unlikely(rte_get_master_lcore() == cid)) {
+    if
(unlikely(rte_get_main_lcore() == cid)) { buf[0] = '\0'; *len = 0; return EDPVS_OK; @@ -2021,7 +2008,7 @@ static int netif_print_isol_lcore_conf(lcoreid_t cid, char *buf, int *len, bool static inline void netif_tx_burst(lcoreid_t cid, portid_t pid, queueid_t qindex) { - int ntx, ii; + int ntx; struct netif_queue_conf *txq; unsigned i = 0; struct rte_mbuf *mbuf_copied = NULL; @@ -2037,7 +2024,7 @@ static inline void netif_tx_burst(lcoreid_t cid, portid_t pid, queueid_t qindex) for (; i < txq->len; i++) { if (NULL == (mbuf_copied = mbuf_copy(txq->mbufs[i], pktmbuf_pool[dev->socket]))) - RTE_LOG(WARNING, NETIF, "%s: Failed to copy mbuf\n", __func__); + RTE_LOG(WARNING, NETIF, "%s: fail to copy outbound mbuf into kni\n", __func__); else kni_ingress(mbuf_copied, dev); } @@ -2047,10 +2034,12 @@ static inline void netif_tx_burst(lcoreid_t cid, portid_t pid, queueid_t qindex) lcore_stats[cid].opackets += ntx; /* do not calculate obytes here in consideration of efficency */ if (unlikely(ntx < txq->len)) { - RTE_LOG(DEBUG, NETIF, "Fail to send %d packets on dpdk%d tx%d\n", ntx,pid, txq->id); + RTE_LOG(INFO, NETIF, "fail to send %d of %d packets on dpdk port %d txq %d\n", + txq->len - ntx, txq->len, pid, txq->id); lcore_stats[cid].dropped += txq->len - ntx; - for (ii = ntx; ii < txq->len; ii++) - rte_pktmbuf_free(txq->mbufs[ii]); + do { + rte_pktmbuf_free(txq->mbufs[ntx]); + } while (++ntx < txq->len); } } @@ -2165,9 +2154,9 @@ int netif_hard_xmit(struct rte_mbuf *mbuf, struct netif_port *dev) cid = rte_lcore_id(); if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) - mbuf->l2_len = sizeof(struct ether_hdr); + mbuf->l2_len = sizeof(struct rte_ether_hdr); - if (rte_get_master_lcore() == cid) { // master thread + if (rte_get_main_lcore() == cid) { // master thread struct dpvs_msg *msg; struct master_xmit_msg_data msg_data; @@ -2178,7 +2167,7 @@ int netif_hard_xmit(struct rte_mbuf *mbuf, struct netif_port *dev) msg_data.mbuf = mbuf; msg_data.dev = dev; - msg = msg_make(MSG_TYPE_MASTER_XMIT, 0, DPVS_MSG_UNICAST, rte_get_master_lcore(), + msg = msg_make(MSG_TYPE_MASTER_XMIT, 0, DPVS_MSG_UNICAST, rte_get_main_lcore(), sizeof(struct master_xmit_msg_data), &msg_data); if (unlikely(NULL == msg)) { rte_pktmbuf_free(mbuf); @@ -2203,9 +2192,9 @@ int netif_hard_xmit(struct rte_mbuf *mbuf, struct netif_port *dev) /* port id is determined by routing */ pid = dev->id; /* qindex is hashed by physical address of mbuf */ - qindex = (((uint32_t) mbuf->buf_physaddr) >> 8) % + qindex = (((uint32_t) mbuf->buf_iova) >> 8) % (lcore_conf[lcore2index[cid]].pqs[port2index[cid][pid]].ntxq); - //RTE_LOG(DEBUG, NETIF, "tx-queue hash(%x) = %d\n", ((uint32_t)mbuf->buf_physaddr) >> 8, qindex); + //RTE_LOG(DEBUG, NETIF, "tx-queue hash(%x) = %d\n", ((uint32_t)mbuf->buf_iova) >> 8, qindex); txq = &lcore_conf[lcore2index[cid]].pqs[port2index[cid][pid]].txqs[qindex]; /* No space left in txq mbufs, transmit cached mbufs immediately */ @@ -2250,14 +2239,14 @@ int netif_xmit(struct rte_mbuf *mbuf, struct netif_port *dev) return netif_hard_xmit(mbuf, dev); } -static inline eth_type_t eth_type_parse(const struct ether_hdr *eth_hdr, +static inline eth_type_t eth_type_parse(const struct rte_ether_hdr *eth_hdr, const struct netif_port *dev) { if (eth_addr_equal(&dev->addr, ð_hdr->d_addr)) return ETH_PKT_HOST; - if (is_multicast_ether_addr(ð_hdr->d_addr)) { - if (is_broadcast_ether_addr(ð_hdr->d_addr)) + if (rte_is_multicast_ether_addr(ð_hdr->d_addr)) { + if (rte_is_broadcast_ether_addr(ð_hdr->d_addr)) return ETH_PKT_BROADCAST; else return 
ETH_PKT_MULTICAST; @@ -2284,12 +2273,12 @@ static int netif_deliver_mbuf(struct netif_port *dev, lcoreid_t cid, struct rte_mbuf *mbuf, bool pkts_from_ring) { int ret = EDPVS_OK; - struct ether_hdr *eth_hdr; + struct rte_ether_hdr *eth_hdr; assert(mbuf->port <= NETIF_MAX_PORTS); assert(dev != NULL); - eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *); + eth_hdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); /* reuse mbuf.packet_type, it was RTE_PTYPE_XXX */ mbuf->packet_type = eth_type_parse(eth_hdr, dev); @@ -2319,13 +2308,13 @@ static int netif_deliver_mbuf(struct netif_port *dev, lcoreid_t cid, int netif_rcv_mbuf(struct netif_port *dev, lcoreid_t cid, struct rte_mbuf *mbuf, bool pkts_from_ring) { - struct ether_hdr *eth_hdr; + struct rte_ether_hdr *eth_hdr; struct pkt_type *pt; int err; uint16_t data_off; bool forward2kni; - eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *); + eth_hdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); /* * do not drop pkt to other hosts (ETH_PKT_OTHERHOST) * since virtual devices may have different MAC with @@ -2344,7 +2333,7 @@ int netif_rcv_mbuf(struct netif_port *dev, lcoreid_t cid, struct rte_mbuf *mbuf, dev = netif_port_get(mbuf->port); if (unlikely(!dev)) goto drop; - eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *); + eth_hdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); } forward2kni = (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) ? true : false; @@ -2358,16 +2347,16 @@ int netif_rcv_mbuf(struct netif_port *dev, lcoreid_t cid, struct rte_mbuf *mbuf, } /* clone arp pkt to every queue */ - if (unlikely(pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring)) { + if (unlikely(pt->type == rte_cpu_to_be_16(RTE_ETHER_TYPE_ARP) && !pkts_from_ring)) { uint8_t i; - struct arp_hdr *arp; + struct rte_arp_hdr *arp; struct rte_mbuf *mbuf_clone; - arp = rte_pktmbuf_mtod_offset(mbuf, struct arp_hdr *, sizeof(struct ether_hdr)); - if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY) { + arp = rte_pktmbuf_mtod_offset(mbuf, struct rte_arp_hdr *, sizeof(struct rte_ether_hdr)); + if (rte_be_to_cpu_16(arp->arp_opcode) == RTE_ARP_OP_REPLY) { for (i = 0; i < DPVS_MAX_LCORE; i++) { if ((i == cid) || (!is_lcore_id_fwd(i)) - || (i == rte_get_master_lcore())) + || (i == rte_get_main_lcore())) continue; /* rte_pktmbuf_clone will not clone pkt.data, just copy pointer! 
*/ mbuf_clone = rte_pktmbuf_clone(mbuf, pktmbuf_pool[rte_socket_id()]); @@ -2389,11 +2378,11 @@ int netif_rcv_mbuf(struct netif_port *dev, lcoreid_t cid, struct rte_mbuf *mbuf, } } - mbuf->l2_len = sizeof(struct ether_hdr); + mbuf->l2_len = sizeof(struct rte_ether_hdr); /* Remove ether_hdr at the beginning of an mbuf */ data_off = mbuf->data_off; - if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr)))) + if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct rte_ether_hdr)))) goto drop; err = pt->func(mbuf, dev); @@ -2665,15 +2654,17 @@ static int update_bond_macaddr(struct netif_port *port) { assert(port->type == PORT_TYPE_BOND_MASTER); - int ret = EDPVS_OK; - rte_eth_macaddr_get(port->id, &port->addr); + if (rte_eth_macaddr_get(port->id, &port->addr)) + return EDPVS_NOTEXIST; + if (kni_dev_exist(port)) { - ret = linux_set_if_mac(port->kni.name, (unsigned char *)&port->addr); - if (ret == EDPVS_OK) - ether_addr_copy(&port->addr, &port->kni.addr); + /* if kni device isn't link up, linux_set_if_mac would fail(Timer expired), + * and in this case the warning can be ingored.*/ + linux_set_if_mac(port->kni.name, (unsigned char *)&port->addr); + rte_ether_addr_copy(&port->addr, &port->kni.addr); } - return ret; + return EDPVS_OK; } static inline void free_mbufs(struct rte_mbuf **pkts, unsigned num) @@ -2906,7 +2897,7 @@ struct netif_port *netif_alloc(size_t priv_size, const char *namefmt, snprintf(dev->name, sizeof(dev->name), "%s", namefmt); dev->socket = SOCKET_ID_ANY; - dev->hw_header_len = sizeof(struct ether_hdr); /* default */ + dev->hw_header_len = sizeof(struct rte_ether_hdr); /* default */ if (setup) setup(dev); @@ -2983,49 +2974,9 @@ static int bond_set_mc_list(struct netif_port *dev) return err; } -static int bond_filter_supported(struct netif_port *dev, enum rte_filter_type fltype) -{ - int i, err = EDPVS_NOTSUPP; - struct netif_port *slave; - - if (dev->type != PORT_TYPE_BOND_MASTER) - return EDPVS_INVAL; - - for (i = 0; i < dev->bond->master.slave_nb; i++) { - slave = dev->bond->master.slaves[i]; - err = rte_eth_dev_filter_supported(slave->id, fltype); - if (err < 0) - return err; - } - - return err; -} - -static int bond_set_fdir_filt(struct netif_port *dev, enum rte_filter_op op, - const struct rte_eth_fdir_filter *filt) -{ - int i, err; - struct netif_port *slave; - - if (dev->type != PORT_TYPE_BOND_MASTER) - return EDPVS_INVAL; - - for (i = 0; i < dev->bond->master.slave_nb; i++) { - slave = dev->bond->master.slaves[i]; - err = netif_fdir_filter_set(slave, op, filt); - if (err != EDPVS_OK) { - RTE_LOG(WARNING, NETIF, "%s: fail to set %s's fdir filter - %d\n", - __func__, slave->name, err); - return err; - } - } - - return EDPVS_OK; -} - static int dpdk_set_mc_list(struct netif_port *dev) { - struct ether_addr addrs[NETIF_MAX_HWADDR]; + struct rte_ether_addr addrs[NETIF_MAX_HWADDR]; int err; int ret; size_t naddr = NELEMS(addrs); @@ -3051,92 +3002,13 @@ static int dpdk_set_mc_list(struct netif_port *dev) return EDPVS_OK; } -static int dpdk_filter_supported(struct netif_port *dev, enum rte_filter_type fltype) -{ - return rte_eth_dev_filter_supported(dev->id, fltype); -} - -void netif_mask_fdir_filter(int af, const struct netif_port *port, - struct rte_eth_fdir_filter *filt) -{ - struct rte_eth_fdir_info fdir_info; - const struct rte_eth_fdir_masks *fmask; - union rte_eth_fdir_flow *flow = &filt->input.flow; - - /* There exists a defect here. If the netif_port 'port' is not PORT_TYPE_GENERAL, - mask fdir_filter of the port would fail. 
The correct way to accomplish the - function is to register this method for all device types. Considering the flow - is not changed after masking, we just skip netif_ports other than physical ones. */ - if (port->type != PORT_TYPE_GENERAL) - return; - - if (rte_eth_dev_filter_ctrl(port->id, RTE_ETH_FILTER_FDIR, - RTE_ETH_FILTER_INFO, &fdir_info) < 0) { - RTE_LOG(DEBUG, NETIF, "%s: Fail to fetch fdir info of %s !\n", - __func__, port->name); - return; - } - fmask = &fdir_info.mask; - - /* ipv4 flow */ - if (af == AF_INET) { - flow->ip4_flow.src_ip &= fmask->ipv4_mask.src_ip; - flow->ip4_flow.dst_ip &= fmask->ipv4_mask.dst_ip; - flow->ip4_flow.tos &= fmask->ipv4_mask.tos; - flow->ip4_flow.ttl &= fmask->ipv4_mask.ttl; - flow->ip4_flow.proto &= fmask->ipv4_mask.proto; - flow->tcp4_flow.src_port &= fmask->src_port_mask; - flow->tcp4_flow.dst_port &= fmask->dst_port_mask; - return; - } - - /* ipv6 flow */ - if (af == AF_INET6) { - flow->ipv6_flow.src_ip[0] &= fmask->ipv6_mask.src_ip[0]; - flow->ipv6_flow.src_ip[1] &= fmask->ipv6_mask.src_ip[1]; - flow->ipv6_flow.src_ip[2] &= fmask->ipv6_mask.src_ip[2]; - flow->ipv6_flow.src_ip[3] &= fmask->ipv6_mask.src_ip[3]; - flow->ipv6_flow.dst_ip[0] &= fmask->ipv6_mask.dst_ip[0]; - flow->ipv6_flow.dst_ip[1] &= fmask->ipv6_mask.dst_ip[1]; - flow->ipv6_flow.dst_ip[2] &= fmask->ipv6_mask.dst_ip[2]; - flow->ipv6_flow.dst_ip[3] &= fmask->ipv6_mask.dst_ip[3]; - flow->ipv6_flow.tc &= fmask->ipv6_mask.tc; - flow->ipv6_flow.proto &= fmask->ipv6_mask.proto; - flow->ipv6_flow.hop_limits &= fmask->ipv6_mask.hop_limits; - flow->tcp6_flow.src_port &= fmask->src_port_mask; - flow->tcp6_flow.dst_port &= fmask->dst_port_mask; - return; - } -} - -static int dpdk_set_fdir_filt(struct netif_port *dev, enum rte_filter_op op, - const struct rte_eth_fdir_filter *filt) -{ - int ret; - - rte_rwlock_write_lock(&dev->dev_lock); - ret = rte_eth_dev_filter_ctrl(dev->id, - RTE_ETH_FILTER_FDIR, op, (void *)filt); - rte_rwlock_write_unlock(&dev->dev_lock); - if (ret < 0) { - RTE_LOG(WARNING, NETIF, "%s: fdir filt set failed for %s -- %s(%d)\n!", - __func__, dev->name, rte_strerror(-ret), ret); - return EDPVS_DPDKAPIFAIL; - } - - return EDPVS_OK; -} - static struct netif_ops dpdk_netif_ops = { .op_set_mc_list = dpdk_set_mc_list, - .op_set_fdir_filt = dpdk_set_fdir_filt, - .op_filter_supported = dpdk_filter_supported, }; static struct netif_ops bond_netif_ops = { + .op_update_addr = update_bond_macaddr, .op_set_mc_list = bond_set_mc_list, - .op_set_fdir_filt = bond_set_fdir_filt, - .op_filter_supported = bond_filter_supported, }; static inline void setup_dev_of_flags(struct netif_port *port) @@ -3223,11 +3095,11 @@ static struct netif_port* netif_rte_port_alloc(portid_t id, int nrxq, port->nrxq = nrxq; // update after port_rx_queues_get(); port->ntxq = ntxq; // update after port_tx_queues_get(); port->socket = rte_eth_dev_socket_id(id); - port->hw_header_len = sizeof(struct ether_hdr); + port->hw_header_len = sizeof(struct rte_ether_hdr); if (port->socket == SOCKET_ID_ANY) port->socket = rte_socket_id(); port->mbuf_pool = pktmbuf_pool[port->socket]; - rte_eth_macaddr_get((uint8_t)id, &port->addr); + rte_eth_macaddr_get((uint8_t)id, &port->addr); // bonding mac is zero here rte_eth_dev_get_mtu((uint8_t)id, &port->mtu); rte_eth_dev_info_get((uint8_t)id, &port->dev_info); port->dev_conf = *conf; @@ -3354,17 +3226,6 @@ int netif_get_stats(struct netif_port *dev, struct rte_eth_stats *stats) return EDPVS_OK; } -int netif_fdir_filter_set(struct netif_port *port, enum rte_filter_op opcode, - const 
struct rte_eth_fdir_filter *fdir_flt) -{ - assert(port && port->netif_ops); - - if (!port->netif_ops->op_set_fdir_filt) - return EDPVS_NOTSUPP; - - return port->netif_ops->op_set_fdir_filt(port, opcode, fdir_flt); -} - int netif_port_conf_get(struct netif_port *port, struct rte_eth_conf *eth_conf) { @@ -3414,33 +3275,6 @@ static inline void port_mtu_set(struct netif_port *port) } -/* - * fdir mask must be set according to configured slave lcore number - * */ -inline static int netif_port_fdir_dstport_mask_set(struct netif_port *port) -{ - uint8_t slave_nb; - int shift; - - netif_get_slave_lcores(&slave_nb, NULL); - for (shift = 0; (0x1 << shift) < slave_nb; shift++) - ; - if (shift >= 16) { - RTE_LOG(ERR, NETIF, "%s: %s's fdir dst_port_mask init failed\n", - __func__, port->name); - return EDPVS_NOTSUPP; - } -#if RTE_VERSION >= 0x10040010 - port->dev_conf.fdir_conf.mask.dst_port_mask = htons(~((~0x0) << shift)); -#else - port->dev_conf.fdir_conf.mask.dst_port_mask = ~((~0x0) << shift); -#endif - - RTE_LOG(INFO, NETIF, "%s:dst_port_mask=%0x\n", port->name, - port->dev_conf.fdir_conf.mask.dst_port_mask); - return EDPVS_OK; -} - static int rss_resolve_proc(char *rss) { int rss_value = 0; @@ -3540,11 +3374,7 @@ static void fill_port_config(struct netif_port *port, char *promisc_on) port->dev_conf.rx_adv_conf.rss_conf.rss_hf |= rss_resolve_proc(rss); } - port->dev_conf.fdir_conf.mode = cfg_stream->fdir_mode; - port->dev_conf.fdir_conf.pballoc = cfg_stream->fdir_pballoc; - port->dev_conf.fdir_conf.status = cfg_stream->fdir_status; port->mtu = cfg_stream->mtu; - if (cfg_stream->rx_queue_nb > 0 && port->nrxq > cfg_stream->rx_queue_nb) { RTE_LOG(WARNING, NETIF, "%s: rx-queues(%d) configured in workers != " "rx-queues(%d) configured in device, setup %d rx-queues for %s\n", @@ -3627,10 +3457,6 @@ static int add_bond_slaves(struct netif_port *port) __func__, port->name, port->bond->master.primary->name); } - if (update_bond_macaddr(port) != EDPVS_OK) { - RTE_LOG(ERR, NETIF, "%s: fail to update %s's macaddr!\n", __func__, port->name); - return EDPVS_INVAL; - } /* Add a MAC address to an internal array of addresses used to enable whitelist * * filtering to accept packets only if the destination MAC address matches */ for (ii = 0; ii < port->bond->master.slave_nb; ii++) { @@ -3648,16 +3474,23 @@ static int add_bond_slaves(struct netif_port *port) return EDPVS_OK; } -/* flush FDIR filters for all physical dpdk ports */ -static int fdir_filter_flush(const struct netif_port *port) +#ifdef CONFIG_DPVS_FDIR +static int config_fdir_conf(struct rte_fdir_conf *fdir_conf) { - if (!port || port->type != PORT_TYPE_GENERAL) - return EDPVS_OK; - if (rte_eth_dev_filter_ctrl(port->id, RTE_ETH_FILTER_FDIR, - RTE_ETH_FILTER_FLUSH, NULL) < 0) - return EDPVS_DPDKAPIFAIL; + int shift; + + /* how many mask bits needed? */ + for (shift = 0; (0x1<= 16) + return EDPVS_INVAL; + + fdir_conf->mask.dst_port_mask = htons(~((~0x0) << shift)); + fdir_conf->mode = g_fdir_mode; + return EDPVS_OK; } +#endif /* * Note: Invoke the function after port is allocated and lcores are configured. 
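
A minimal standalone sketch of the port-mask arithmetic in `config_fdir_conf()` above. The worker count `slave_nb = 8` is an assumption chosen for illustration; it reproduces the `0x0700` default `dst_port_mask` that appears later in `default_port_conf`.

```c
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t slave_nb = 8;   /* assumed number of slave lcores */
    int shift;

    /* how many low bits of the port are needed to tell the workers apart? */
    for (shift = 0; (0x1 << shift) < slave_nb; shift++)
        ;
    if (shift >= 16)
        return 1;           /* cannot encode the lcore in a 16-bit port */

    uint16_t mask = htons(~((~0x0) << shift));
    /* slave_nb=8 -> shift=3 -> host mask 0x0007, big-endian 0x0700 */
    printf("shift=%d dst_port_mask=0x%04x\n", shift, mask);
    return 0;
}
```
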
@@ -3665,6 +3498,7 @@ static int fdir_filter_flush(const struct netif_port *port) int netif_port_start(struct netif_port *port) { int ii, ret; + lcoreid_t cid; queueid_t qid; char promisc_on; char buf[512]; @@ -3691,11 +3525,13 @@ int netif_port_start(struct netif_port *port) } // device configure - if ((ret = netif_port_fdir_dstport_mask_set(port)) != EDPVS_OK) - return ret; if ((ret = rte_eth_dev_set_mtu(port->id,port->mtu)) != EDPVS_OK) return ret; - +#ifdef CONFIG_DPVS_FDIR + ret = config_fdir_conf(&port->dev_conf.fdir_conf); + if (ret != EDPVS_OK) + return ret; +#endif if (port->flag & NETIF_PORT_FLAG_TX_IP_CSUM_OFFLOAD) port->dev_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; if (port->flag & NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD) @@ -3755,7 +3591,7 @@ int netif_port_start(struct netif_port *port) } netif_print_port_conf(&port->dev_conf, buf, &buflen); - RTE_LOG(INFO, NETIF, "device %s configuration:\n%s\n\n", port->name, buf); + RTE_LOG(INFO, NETIF, "device %s configuration:\n%s\n", port->name, buf); // build port-queue-lcore mapping array build_port_queue_lcore_map(); @@ -3793,15 +3629,13 @@ int netif_port_start(struct netif_port *port) rte_eth_promiscuous_enable(port->id); } - /* bonding device's macaddr is updated by its primary device when start, - * so we should update its macaddr after start. */ - if (port->type == PORT_TYPE_BOND_MASTER) - update_bond_macaddr(port); + /* update mac addr to netif_port and netif_kni after start */ + if (port->netif_ops->op_update_addr) + port->netif_ops->op_update_addr(port); /* add in6_addr multicast address */ - int cid = 0; - rte_eal_mp_remote_launch(idev_add_mcast_init, port, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(idev_add_mcast_init, port, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((ret = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, NETIF, "%s: lcore %d: multicast address add failed for device %s\n", __func__, cid, port->name); @@ -3809,13 +3643,6 @@ int netif_port_start(struct netif_port *port) } } - /* flush FDIR filters */ - ret = fdir_filter_flush(port); - if (ret != EDPVS_OK) { - RTE_LOG(WARNING, NETIF, "fail to flush FDIR filters for device %s\n", port->name); - return ret; - } - return EDPVS_OK; } @@ -3955,11 +3782,18 @@ static int relate_bonding_device(void) return EDPVS_EXIST; } mport->bond->master.slaves[i] = sport; - if (!strcmp(bond_conf->slaves[i], bond_conf->primary)) + if (!strcmp(bond_conf->slaves[i], bond_conf->primary)) { mport->bond->master.primary = sport; + rte_ether_addr_copy(&sport->addr, &mport->addr); /* use primary slave's macaddr for bonding */ + } assert(sport->type == PORT_TYPE_GENERAL); - /* FIXME: all slaves share the same socket with master, otherwise kernel crash */ - sport->socket = mport->socket; + if (sport->socket != mport->socket) { + /* FIXME: all slaves share the same socket with master, otherwise kernel crash */ + RTE_LOG(WARNING, NETIF, "%s: %s is created on numa node %d, while its slave %s" + " is on numa node %d\n", __func__, mport->name, mport->socket, + sport->name, sport->socket); + sport->socket = mport->socket; + } sport->type = PORT_TYPE_BOND_SLAVE; sport->bond->slave.master = mport; } @@ -3984,35 +3818,28 @@ static struct rte_eth_conf default_port_conf = { .txmode = { .mq_mode = ETH_MQ_TX_NONE, }, +#ifdef CONFIG_DPVS_FDIR .fdir_conf = { - .mode = RTE_FDIR_MODE_PERFECT, + .mode = RTE_FDIR_MODE_PERFECT, /* maybe changed by config file */ .pballoc = RTE_FDIR_PBALLOC_64K, - .status = RTE_FDIR_REPORT_STATUS/*_ALWAYS*/, + .status = 
RTE_FDIR_REPORT_STATUS, .mask = { - .vlan_tci_mask = 0x0, - .ipv4_mask = { - .src_ip = 0x00000000, - .dst_ip = 0xFFFFFFFF, + .ipv4_mask = { + .dst_ip = 0xFFFFFFFF, }, - .ipv6_mask = { - .src_ip = { 0, 0, 0, 0 }, - .dst_ip = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + .ipv6_mask = { + .dst_ip = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, }, - .src_port_mask = 0x0000, - /* to be changed according to slave lcore number in use */ - .dst_port_mask = 0x00F8, - - .mac_addr_byte_mask = 0x00, - .tunnel_type_mask = 0, - .tunnel_id_mask = 0, + .dst_port_mask = 0x0700, }, - .drop_queue = 127, - .flex_conf = { - .nb_payloads = 0, - .nb_flexmasks = 0, + .drop_queue = 127, + .flex_conf = { + .nb_payloads = 0, + .nb_flexmasks = 0, }, }, +#endif }; int netif_print_port_conf(const struct rte_eth_conf *port_conf, char *buf, int *len) @@ -4054,30 +3881,6 @@ int netif_print_port_conf(const struct rte_eth_conf *port_conf, char *buf, int * strncat(buf, tbuf1, *len - strlen(buf) - 1); } - memset(tbuf1, 0, sizeof(tbuf1)); - snprintf(tbuf1, sizeof(tbuf1), - "fdir ipv4 mask: src 0x%08x dst 0x%08x\n" - "fdir ipv6 mask: src 0x%08x:%08x:%08x:%08x dst 0x%08x:%08x:%08x:%08x\n" - "fdir port mask: src 0x%04x dst 0x%04x\n", - port_conf->fdir_conf.mask.ipv4_mask.src_ip, - port_conf->fdir_conf.mask.ipv4_mask.dst_ip, - port_conf->fdir_conf.mask.ipv6_mask.src_ip[0], - port_conf->fdir_conf.mask.ipv6_mask.src_ip[1], - port_conf->fdir_conf.mask.ipv6_mask.src_ip[2], - port_conf->fdir_conf.mask.ipv6_mask.src_ip[3], - port_conf->fdir_conf.mask.ipv6_mask.dst_ip[0], - port_conf->fdir_conf.mask.ipv6_mask.dst_ip[1], - port_conf->fdir_conf.mask.ipv6_mask.dst_ip[2], - port_conf->fdir_conf.mask.ipv6_mask.dst_ip[3], - port_conf->fdir_conf.mask.src_port_mask, - port_conf->fdir_conf.mask.dst_port_mask - ); - if (*len - strlen(buf) - 1 < strlen(tbuf1)) { - RTE_LOG(WARNING, NETIF, "[%s] no enough buf\n", __func__); - return EDPVS_INVAL; - } - strncat(buf, tbuf1, *len - strlen(buf) - 1); - *len = strlen(buf); return EDPVS_OK; } @@ -4185,89 +3988,91 @@ static int obtain_dpdk_bond_name(char *dst, const char *ori, size_t size) } /* - * netif_virtual_devices_add must be called before lcore_init and port_init, so calling the - * function immediately after cfgfile_init is recommended. + * netif_virtual_devices_add must be called before lcore_init and port_init, + * so it's recommended to call this function immediately after cfgfile_init. */ int netif_vdevs_add(void) { - portid_t pid; - int socket_id; + int ret; struct bond_conf_stream *bond_cfg; #ifdef NETIF_BONDING_DEBUG - int ii; + int ii, len = 0; + char slavenames[NETIF_MAX_BOND_SLAVES*IFNAMSIZ]; list_for_each_entry_reverse(bond_cfg, &bond_list, bond_list_node) { - if (!bond_cfg->primary[0]) - strncpy(bond_cfg->primary, bond_cfg->slaves[0], sizeof(bond_cfg->primary)); - printf("Add bonding device \"%s\": mode=%d, primary=%s, slaves=\"", - bond_cfg->name, bond_cfg->mode, bond_cfg->primary); - for (ii = 0; ii < NETIF_MAX_BOND_SLAVES && bond_cfg->slaves[ii][0]; ii++) - printf("%s ", bond_cfg->slaves[ii]); - printf("\"\n"); + for (ii = 0; ii < NETIF_MAX_BOND_SLAVES && bond_cfg->slaves[ii][0]; ii++) { + ret = snprintf(&slavenames[len], sizeof(slavenames)-len-1, "%s ", bond_cfg->slaves[ii]); + if (ret >= 0) + len += ret; + } + RTE_LOG(DEBUG, NETIF, "Add bonding device \"%s\"" + "\n\tmode: %d" + "\n\tprimary: %s" + "\n\tnuma_node: %d" + "\n\tslaves: %s\n", + bond_cfg->name, + bond_cfg->mode, + bond_cfg->primary[0] ? bond_cfg->primary : ii > 0 ? 
bond_cfg->slaves[0] : "", + bond_cfg->numa_node, + slavenames); } #endif + /* set phy_pid_end/bond_pid_base before create bonding device */ phy_pid_end = dpvs_rte_eth_dev_count(); - port_id_end = max(port_id_end, phy_pid_end); - /* set bond_pid_offset before create bonding device */ if (!list_empty(&bond_list)) bond_pid_base = phy_pid_end; list_for_each_entry_reverse(bond_cfg, &bond_list, bond_list_node) { + char bondname[IFNAMSIZ] = {'\0'}; + if (!bond_cfg->slaves[0][0]) { RTE_LOG(WARNING, NETIF, "%s: no slaves configured for %s, skip ...\n", __func__, bond_cfg->name); return EDPVS_INVAL; } + /* use the first slave as primary if not configured */ - if (!bond_cfg->primary[0]) + if (!bond_cfg->primary[0]) { + RTE_LOG(INFO, NETIF, "%s: %s primary slave is not configured, using %s\n", + __func__, bond_cfg->name, bond_cfg->slaves[0]); strncpy(bond_cfg->primary, bond_cfg->slaves[0], sizeof(bond_cfg->primary)); - /* FIXME: which socket should bonding device located on? Ideally, socket of the primary - * bonding slave. But here we cannot obtain slave port id from its name by - * "rte_lcore_to_socket_id" due to the uninitialized netif_port list. - * Here we use master lcore's socket as the compromise. Another solution is to appoint - * the socket id in the cfgfile. - * */ - socket_id = rte_lcore_to_socket_id(rte_lcore_id()); - if (socket_id < 0) { - RTE_LOG(ERR, NETIF, "%s: fail to get socket id for %s\n", - __func__, bond_cfg->name); - return EDPVS_INVAL; } - char dummy_name[IFNAMSIZ] = {'\0'}; - int rc = obtain_dpdk_bond_name(dummy_name, bond_cfg->name, IFNAMSIZ); - if (rc != EDPVS_OK) { - RTE_LOG(ERR, NETIF, "%s: wrong bond device name in config file %s\n", + ret = obtain_dpdk_bond_name(bondname, bond_cfg->name, IFNAMSIZ); + if (ret != EDPVS_OK) { + RTE_LOG(ERR, NETIF, "%s: invalid bonding device name in config file %s\n", __func__, bond_cfg->name); return EDPVS_INVAL; } - /* int pid_rc = rte_eth_bond_create(bond_cfg->name, bond_cfg->mode, socket_id); */ - int pid_rc = rte_eth_bond_create(dummy_name, bond_cfg->mode, socket_id); - if (pid_rc < 0) { - RTE_LOG(ERR, NETIF, "%s: fail to create bonding device %s(mode=%d, socket=%d)\n", - __func__, bond_cfg->name, bond_cfg->mode, socket_id); + + /* Note that all slaves' numa nodes should be the same as the one of bonding, + * otherwise the bonding and slaves cannot link up. Nevertheless, if you are + * to use slaves from different numa nodes, the dpdk patch + * [bonding: allow slaves from different numa nodes] + * should be applied, which may cause negative influence on performance. 
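
Two behavior changes are worth noting here: `rte_eth_bond_create()` now takes the NUMA node from the config file rather than the master lcore's socket, and the dedicated-queues check is no longer inverted (the enable function returns 0 on success). A hedged sketch of that creation path; the helper name is illustrative, not part of the patch:

```c
#include <stdbool.h>
#include <rte_eth_bond.h>
#include <rte_eth_bond_8023ad.h>

/* Hypothetical helper mirroring netif_vdevs_add(): create a bonding device
 * on an explicit NUMA node and, for mode 4 (802.3ad), optionally enable
 * dedicated control queues. */
static int create_bond_on_node(const char *name, uint8_t mode,
                               uint8_t numa_node, bool dedicated_queues)
{
    int pid = rte_eth_bond_create(name, mode, numa_node);
    if (pid < 0)
        return pid;                     /* creation failed */

    /* returns 0 on success, so non-zero is the failure case */
    if (mode == BONDING_MODE_8023AD && dedicated_queues &&
            rte_eth_bond_8023ad_dedicated_queues_enable(pid) != 0)
        return -1;

    return pid;                         /* port id of the new device */
}
```
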
*/ + ret = rte_eth_bond_create(bondname, bond_cfg->mode, bond_cfg->numa_node); + if (ret < 0) { + RTE_LOG(ERR, NETIF, "%s: fail to create bonding device %s: mode=%d, numa_node=%d\n", + __func__, bond_cfg->name, bond_cfg->mode, bond_cfg->numa_node); return EDPVS_CALLBACKFAIL; } - pid = pid_rc; - RTE_LOG(INFO, NETIF, "create bondig device %s: mode=%d, primary=%s, socket=%d\n", - bond_cfg->name, bond_cfg->mode, bond_cfg->primary, socket_id); - bond_cfg->port_id = pid; /* relate port_id with port_name, used by netif_rte_port_alloc */ - if (bond_cfg->mode == BONDING_MODE_8023AD) { - if (!rte_eth_bond_8023ad_dedicated_queues_enable(bond_cfg->port_id)) - { - RTE_LOG(INFO, NETIF, "bonding mode4 dedicated queues enable failed!\n"); + bond_cfg->port_id = ret; /* relate port_id with port_name, used by netif_rte_port_alloc */ + RTE_LOG(INFO, NETIF, "created bondig device %s: mode=%d, primary=%s, numa_node=%d\n", + bond_cfg->name, bond_cfg->mode, bond_cfg->primary, bond_cfg->numa_node); + + if (bond_cfg->mode == BONDING_MODE_8023AD && bond_cfg->options.dedicated_queues_enable) { + if (rte_eth_bond_8023ad_dedicated_queues_enable(bond_cfg->port_id)) { + RTE_LOG(INFO, NETIF, "%s: bonding mode4 dedicated queues enable failed!\n", __func__); } } } if (!list_empty(&bond_list)) { bond_pid_end = dpvs_rte_eth_dev_count(); - port_id_end = max(port_id_end, bond_pid_end); - RTE_LOG(INFO, NETIF, "bonding device port id range: [%d, %d)\n", - bond_pid_base, bond_pid_end); + RTE_LOG(INFO, NETIF, "bonding device port id range: [%d, %d)\n", bond_pid_base, bond_pid_end); } return EDPVS_OK; @@ -4281,7 +4086,7 @@ int netif_init(void) netif_port_init(); netif_lcore_init(); - g_master_lcore_id = rte_get_master_lcore(); + g_master_lcore_id = rte_get_main_lcore(); netif_get_slave_lcores(&g_slave_lcore_num, &g_slave_lcore_mask); netif_get_isol_rx_lcores(&g_isol_rx_lcore_num, &g_isol_rx_lcore_mask); @@ -4556,7 +4361,7 @@ static int get_port_basic(struct netif_port *port, void **out, size_t *out_len) strncpy(get->name, port->name, sizeof(get->name)); get->nrxq = port->nrxq; get->ntxq = port->ntxq; - ether_format_addr(get->addr, sizeof(get->addr), &port->addr); + rte_ether_format_addr(get->addr, sizeof(get->addr), &port->addr); get->socket_id = port->socket; get->mtu = port->mtu; @@ -4829,10 +4634,10 @@ static int get_bond_status(struct netif_port *port, void **out, size_t *out_len) get->slaves[i].is_active = 1; if (slaves[i] == primary) get->slaves[i].is_primary = 1; - ether_format_addr(&get->slaves[i].macaddr[0], sizeof(get->slaves[i].macaddr) - 1, &sport->addr); + rte_ether_format_addr(&get->slaves[i].macaddr[0], sizeof(get->slaves[i].macaddr) - 1, &sport->addr); } - ether_format_addr(get->macaddr, sizeof(get->macaddr), &mport->addr); + rte_ether_format_addr(get->macaddr, sizeof(get->macaddr), &mport->addr); xmit_policy = rte_eth_bond_xmit_policy_get(port->id); switch (xmit_policy) { @@ -4953,7 +4758,7 @@ static int set_lcore(const netif_lcore_set_t *lcore_cfg) static int set_port(struct netif_port *port, const netif_nic_set_t *port_cfg) { - struct ether_addr ea; + struct rte_ether_addr ea; assert(port_cfg); if (port_cfg->promisc_on) { @@ -5016,7 +4821,7 @@ static int set_port(struct netif_port *port, const netif_nic_set_t *port_cfg) (unsigned *)&ea.addr_bytes[3], (unsigned *)&ea.addr_bytes[4], (unsigned *)&ea.addr_bytes[5]); - if (is_valid_assigned_ether_addr(&ea)) { + if (rte_is_valid_assigned_ether_addr(&ea)) { if (port->type == PORT_TYPE_BOND_MASTER) { if (rte_eth_bond_mac_address_set(port->id, &ea) < 0) { RTE_LOG(WARNING, 
NETIF, "fail to set %s's macaddr to be %s\n", @@ -5090,9 +4895,10 @@ static int set_bond(struct netif_port *port, const netif_bond_set_t *bond_cfg) port->bond->master.slave_nb--; } } - /* ATTENITON: neighbor get macaddr from port->addr, thus it should be updated */ - if (update_bond_macaddr(port) != EDPVS_OK) - RTE_LOG(ERR, NETIF, "%s: fail to update %s's macaddr!\n", __func__, port->name); + if (port->netif_ops->op_update_addr) { + if (port->netif_ops->op_update_addr(port) != EDPVS_OK) + RTE_LOG(ERR, NETIF, "%s: fail to update %s's mac address!\n", __func__, port->name); + } break; } case OPT_PRIMARY: @@ -5106,9 +4912,10 @@ static int set_bond(struct netif_port *port, const netif_bond_set_t *bond_cfg) port->name, port->bond->master.primary->name, primary->name); port->bond->master.primary = primary; } - /* ATTENITON: neighbor get macaddr from port->addr, thus it should be updated */ - if (update_bond_macaddr(port) != EDPVS_OK) - RTE_LOG(ERR, NETIF, "%s: fail to update %s's macaddr!\n", __func__, port->name); + if (port->netif_ops->op_update_addr) { + if (port->netif_ops->op_update_addr(port) != EDPVS_OK) + RTE_LOG(ERR, NETIF, "%s: fail to update %s's mac address!\n", __func__, port->name); + } break; } case OPT_XMIT_POLICY: diff --git a/src/netif_addr.c b/src/netif_addr.c index 4bc6c66ed..da93f2e7e 100644 --- a/src/netif_addr.c +++ b/src/netif_addr.c @@ -26,7 +26,7 @@ #include "kni.h" static int __netif_hw_addr_add(struct netif_hw_addr_list *list, - const struct ether_addr *addr) + const struct rte_ether_addr *addr) { struct netif_hw_addr *ha; @@ -41,7 +41,7 @@ static int __netif_hw_addr_add(struct netif_hw_addr_list *list, if (!ha) return EDPVS_NOMEM; - ether_addr_copy(addr, &ha->addr); + rte_ether_addr_copy(addr, &ha->addr); rte_atomic32_set(&ha->refcnt, 1); ha->sync_cnt = 0; list_add_tail(&ha->list, &list->addrs); @@ -51,7 +51,7 @@ static int __netif_hw_addr_add(struct netif_hw_addr_list *list, } static int __netif_hw_addr_del(struct netif_hw_addr_list *list, - const struct ether_addr *addr) + const struct rte_ether_addr *addr) { struct netif_hw_addr *ha, *n; @@ -210,17 +210,17 @@ static int __netif_hw_addr_unsync_multiple(struct netif_hw_addr_list *to, return EDPVS_INVAL; } -int __netif_mc_add(struct netif_port *dev, const struct ether_addr *addr) +int __netif_mc_add(struct netif_port *dev, const struct rte_ether_addr *addr) { return __netif_hw_addr_add(&dev->mc, addr); } -int __netif_mc_del(struct netif_port *dev, const struct ether_addr *addr) +int __netif_mc_del(struct netif_port *dev, const struct rte_ether_addr *addr) { return __netif_hw_addr_del(&dev->mc, addr); } -int netif_mc_add(struct netif_port *dev, const struct ether_addr *addr) +int netif_mc_add(struct netif_port *dev, const struct rte_ether_addr *addr) { int err; @@ -233,7 +233,7 @@ int netif_mc_add(struct netif_port *dev, const struct ether_addr *addr) return err; } -int netif_mc_del(struct netif_port *dev, const struct ether_addr *addr) +int netif_mc_del(struct netif_port *dev, const struct rte_ether_addr *addr) { int err; @@ -272,7 +272,7 @@ void netif_mc_init(struct netif_port *dev) } int __netif_mc_dump(struct netif_port *dev, - struct ether_addr *addrs, size_t *naddr) + struct rte_ether_addr *addrs, size_t *naddr) { struct netif_hw_addr *ha; int off = 0; @@ -281,14 +281,14 @@ int __netif_mc_dump(struct netif_port *dev, return EDPVS_NOROOM; list_for_each_entry(ha, &dev->mc.addrs, list) - ether_addr_copy(&ha->addr, &addrs[off++]); + rte_ether_addr_copy(&ha->addr, &addrs[off++]); *naddr = off; return EDPVS_OK; } int 
netif_mc_dump(struct netif_port *dev, - struct ether_addr *addrs, size_t *naddr) + struct rte_ether_addr *addrs, size_t *naddr) { int err; @@ -302,7 +302,7 @@ int netif_mc_dump(struct netif_port *dev, int __netif_mc_print(struct netif_port *dev, char *buf, int *len, int *pnaddr) { - struct ether_addr addrs[NETIF_MAX_HWADDR]; + struct rte_ether_addr addrs[NETIF_MAX_HWADDR]; size_t naddr = NELEMS(addrs); int err, i; int strlen = 0; diff --git a/src/netif_flow.c b/src/netif_flow.c new file mode 100644 index 000000000..ed4da0874 --- /dev/null +++ b/src/netif_flow.c @@ -0,0 +1,409 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2020 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include "vlan.h" +#include "netif_flow.h" + +#define RTE_LOGTYPE_FLOW RTE_LOGTYPE_USER1 + +/* uncomment the macro if rte_flow pmd driver is not thread-safe. */ +// #define CONFIG_DEV_FLOW_LOCK + +/* sapool pattern stack: ETH | IP | TCP/UDP | END */ +#define SAPOOL_PATTERN_NUM 4 +/* sapool action stack: QUEUE | END */ +#define SAPOOL_ACTION_NUM 2 + +/* dpvs use only one flow group */ +#define NETIF_FLOW_GROUP 0 + +/* DPVS flow type and priority. + * The enum value matters. Lower value denotes higher priority. */ +typedef enum { + NETIF_FLOW_PRIO_SAPOOL = 1, // sapool flow rules + NETIF_FLOW_PRIO_TUNNEL, // TODO, gre tunnel flow rules + // more ... +} netif_flow_type_prio_t; + +static inline void netif_flow_lock(struct netif_port *dev) +{ +#ifdef CONFIG_DEV_FLOW_LOCK + rte_rwlock_write_lock(&dev->dev_lock); +#endif +} + +static inline void netif_flow_unlock(struct netif_port *dev) +{ +#ifdef CONFIG_DEV_FLOW_LOCK + rte_rwlock_write_unlock(&dev->dev_lock); +#endif +} + +/* + * Create a rte_flow on a physical port. + */ +static inline int __netif_flow_create(struct netif_port *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + struct netif_flow_handler *flow) +{ + struct rte_flow_error flow_error; + + if (unlikely(!flow || !dev || (dev->type != PORT_TYPE_GENERAL && + dev->type != PORT_TYPE_BOND_SLAVE))) + return EDPVS_INVAL; + + netif_flow_lock(dev); + if (rte_flow_validate(dev->id, attr, pattern, actions, &flow_error)) { + netif_flow_unlock(dev); + RTE_LOG(WARNING, FLOW, "rte_flow_validate on %s failed -- %d, %s\n", + dev->name, flow_error.type, flow_error.message); + return EDPVS_DPDKAPIFAIL; + } + + flow->handler = rte_flow_create(dev->id, attr, pattern, actions, &flow_error); + netif_flow_unlock(dev); + if (!flow->handler) { + flow->pid = 0; + RTE_LOG(WARNING, FLOW, "rte_flow_create on %s failed -- %d, %s\n", + dev->name, flow_error.type, flow_error.message); + return EDPVS_DPDKAPIFAIL; + } + flow->pid = dev->id; + + return EDPVS_OK; +} + +/* + * Remove a specified rte_flow. 
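
A condensed sketch of the validate-then-create idiom `__netif_flow_create()` uses above: the rule is checked first so a malformed rule is rejected with a usable `rte_flow_error` before `rte_flow_create()` is attempted. DPVS-specific locking and port-type checks are omitted.

```c
#include <rte_flow.h>

/* Sketch only: same idiom as __netif_flow_create(), stripped down. */
static struct rte_flow *flow_install(uint16_t port_id,
                                     const struct rte_flow_attr *attr,
                                     const struct rte_flow_item pattern[],
                                     const struct rte_flow_action actions[])
{
    struct rte_flow_error err;

    if (rte_flow_validate(port_id, attr, pattern, actions, &err) != 0)
        return NULL;   /* err.type and err.message explain the rejection */

    return rte_flow_create(port_id, attr, pattern, actions, &err);
}
```
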
+ */ +static int __netif_flow_destroy(struct netif_flow_handler *flow) +{ + struct netif_port *dev; + struct rte_flow_error flow_error; + + if (unlikely(!flow || !flow->handler)) + return EDPVS_INVAL; + + dev = netif_port_get(flow->pid); + if (unlikely(!dev || (dev->type != PORT_TYPE_GENERAL && + dev->type != PORT_TYPE_BOND_SLAVE))) + return EDPVS_INVAL; + + netif_flow_lock(dev); + if (rte_flow_destroy(flow->pid, (struct rte_flow *)flow->handler, &flow_error)) { + RTE_LOG(WARNING, FLOW, "rte_flow_destroy on %s failed -- %d, %s\n", + dev->name, flow_error.type, flow_error.message); + netif_flow_unlock(dev); + return EDPVS_DPDKAPIFAIL; + } + netif_flow_unlock(dev); + + return EDPVS_OK; +} + +/* + * Create rte_flow on specified device. + */ +static int netif_flow_create(struct netif_port *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item pattern[], + const struct rte_flow_action actions[], + netif_flow_handler_param_t *flows) +{ + int err; + + if (unlikely(!dev || !flows)) + return EDPVS_INVAL; + + if (dev->type == PORT_TYPE_VLAN) { + struct vlan_dev_priv *vlan = netif_priv(dev); + if (unlikely(!vlan || !vlan->real_dev)) + return EDPVS_INVAL; + dev = vlan->real_dev; + } + + if (dev->type == PORT_TYPE_GENERAL) { + if (unlikely(flows->size < 1 || !flows->handlers)) + return EDPVS_INVAL; + err = __netif_flow_create(dev, attr, pattern, actions, &flows->handlers[0]); + flows->flow_num = (err == EDPVS_OK) ? 1 : 0; + return err; + } + + if (dev->type == PORT_TYPE_BOND_MASTER) { + int i, slave_nb; + slave_nb = dev->bond->master.slave_nb; + + if (unlikely(flows->size < slave_nb || !flows->handlers)) + return EDPVS_INVAL; + for (i = 0; i < slave_nb; i++) { + err = __netif_flow_create(dev->bond->master.slaves[i], attr, pattern, actions, &flows->handlers[i]); + if (err != EDPVS_OK) { + while (--i >= 0) + __netif_flow_destroy(&flows->handlers[i]); + return err; + } + } + flows->flow_num = slave_nb; + return EDPVS_OK; + } + + return EDPVS_INVAL; +} + +/* + * Destroy specified rte_flow. + */ +static int netif_flow_destroy(netif_flow_handler_param_t *flows) +{ + int i, err, ret = EDPVS_OK; + + if (unlikely(!flows || flows->flow_num > flows->size || !flows->handlers)) + return EDPVS_INVAL; + + for (i = 0; i < flows->flow_num; i++) { + err = __netif_flow_destroy(&flows->handlers[i]); + if (err != EDPVS_OK) + ret = err; + } + + return ret; +} + +/* + * Flush rte_flow of a physical port. + */ +static inline int __netif_flow_flush(struct netif_port *dev) +{ + struct rte_flow_error flow_error; + + if (unlikely(!dev || (dev->type != PORT_TYPE_GENERAL && + dev->type != PORT_TYPE_BOND_SLAVE))) + return EDPVS_INVAL; + + if (rte_flow_flush(dev->id, &flow_error)) { + RTE_LOG(WARNING, FLOW, "rte_flow_flush on %s failed -- %d, %s, %s\n", + dev->name, flow_error.type, flow_error.cause, flow_error.message); + return EDPVS_DPDKAPIFAIL; + } + + return EDPVS_OK; +} + +/* + * Flush rte_flow on specified device. + * + * Note: + * It invalidates all rte_flow handlers related to this device. + * If the handlers are saved elsewhere previously, don't use any of them after being flushed. 
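
Below, `netif_sapool_flow_add()` assembles the per-lcore sapool rule. A self-contained sketch of the same rule layout — four pattern items (ETH | IPV4 | TCP | END) and two actions (QUEUE | END) — with all values illustrative:

```c
#include <string.h>
#include <arpa/inet.h>
#include <rte_byteorder.h>
#include <rte_flow.h>

/* Sketch: steer ingress IPv4 packets for one local address, whose TCP
 * destination port falls in an lcore's port range, to that lcore's queue. */
static struct rte_flow *sapool_tcp_rule(uint16_t port_id, rte_be32_t laddr,
                                        rte_be16_t port_base,
                                        rte_be16_t port_mask,
                                        uint16_t rx_queue)
{
    /* group 0 / priority 1 correspond to NETIF_FLOW_GROUP and
     * NETIF_FLOW_PRIO_SAPOOL above */
    struct rte_flow_attr attr = { .group = 0, .priority = 1, .ingress = 1 };
    struct rte_flow_item pattern[4];
    struct rte_flow_action action[2];
    struct rte_flow_item_ipv4 ip_spec, ip_mask;
    struct rte_flow_item_tcp tcp_spec, tcp_mask;
    struct rte_flow_action_queue queue = { .index = rx_queue };
    struct rte_flow_error err;

    memset(pattern, 0, sizeof(pattern));
    memset(action, 0, sizeof(action));
    memset(&ip_spec, 0, sizeof(ip_spec));
    memset(&ip_mask, 0, sizeof(ip_mask));
    memset(&tcp_spec, 0, sizeof(tcp_spec));
    memset(&tcp_mask, 0, sizeof(tcp_mask));

    ip_spec.hdr.dst_addr = laddr;               /* local (sapool) address */
    ip_mask.hdr.dst_addr = htonl(0xffffffff);   /* exact match on dst ip */
    tcp_spec.hdr.dst_port = port_base;          /* lcore's port pattern */
    tcp_mask.hdr.dst_port = port_mask;          /* low bits pick the lcore */

    pattern[0].type = RTE_FLOW_ITEM_TYPE_ETH;
    pattern[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
    pattern[1].spec = &ip_spec;
    pattern[1].mask = &ip_mask;
    pattern[2].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[2].spec = &tcp_spec;
    pattern[2].mask = &tcp_mask;
    pattern[3].type = RTE_FLOW_ITEM_TYPE_END;

    action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
    action[0].conf = &queue;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    return rte_flow_create(port_id, &attr, pattern, action, &err);
}
```

The UDP variant differs only in pattern item 2 (`RTE_FLOW_ITEM_TYPE_UDP` with `rte_flow_item_udp`), which is why the function below installs two rules per address.
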
+ */ +int netif_flow_flush(struct netif_port *dev) +{ + if (unlikely(!dev)) + return EDPVS_INVAL; + + if (dev->type == PORT_TYPE_BOND_SLAVE) + return EDPVS_OK; + + if (dev->type == PORT_TYPE_VLAN) { + struct vlan_dev_priv *vlan = netif_priv(dev); + if (unlikely(!vlan || !vlan->real_dev)) + return EDPVS_INVAL; + dev = vlan->real_dev; + } + + if (dev->type == PORT_TYPE_GENERAL) { + if (__netif_flow_flush(dev) != EDPVS_OK) + return EDPVS_RESOURCE; + return EDPVS_OK; + } + + if (dev->type == PORT_TYPE_BOND_MASTER) { + int i, slave_nb, err; + err = EDPVS_OK; + slave_nb = dev->bond->master.slave_nb; + for (i = 0; i < slave_nb; i++) { + if (__netif_flow_flush(dev->bond->master.slaves[i]) != EDPVS_OK) + err = EDPVS_RESOURCE; + } + return err; + } + + return EDPVS_NOTSUPP; +} + +/* + * Set sa_pool flow rules. + * + * Ether | IPv4/IPv6 | TCP/UDP + */ +int netif_sapool_flow_add(struct netif_port *dev, lcoreid_t cid, + int af, const union inet_addr *addr, + __be16 port_base, __be16 port_mask, + netif_flow_handler_param_t *flows) +{ + int err, ret = EDPVS_OK, nflows = 0; + char ipbuf[64]; + struct rte_flow_attr attr = { + .group = NETIF_FLOW_GROUP, + .priority = NETIF_FLOW_PRIO_SAPOOL, + .ingress = 1, + .egress = 0, + //.transfer = 0, + }; + struct rte_flow_item pattern[SAPOOL_PATTERN_NUM]; + struct rte_flow_action action[SAPOOL_ACTION_NUM]; + netif_flow_handler_param_t resp; + + struct rte_flow_item_ipv4 ip_spec, ip_mask; + struct rte_flow_item_ipv6 ip6_spec, ip6_mask; + struct rte_flow_item_tcp tcp_spec, tcp_mask; + struct rte_flow_item_udp udp_spec, udp_mask; + + queueid_t queue_id; + struct rte_flow_action_queue queue; + + if (unlikely(!dev || !addr || !flows)) + return EDPVS_INVAL; + if (unlikely(flows->size < 4 || !flows->handlers)) + return EDPVS_INVAL; + + memset(pattern, 0, sizeof(pattern)); + memset(action, 0, sizeof(action)); + + /* create action stack */ + err = netif_get_queue(dev, cid, &queue_id); + if (unlikely(err != EDPVS_OK)) + return err; + queue.index = queue_id; + action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE; + action[0].conf = &queue; + action[1].type = RTE_FLOW_ACTION_TYPE_END; + + /* create pattern stack */ + pattern[0].type = RTE_FLOW_ITEM_TYPE_ETH; + + if (af == AF_INET) { + memset(&ip_spec, 0, sizeof(struct rte_flow_item_ipv4)); + memset(&ip_mask, 0, sizeof(struct rte_flow_item_ipv4)); + ip_spec.hdr.dst_addr = addr->in.s_addr; + ip_mask.hdr.dst_addr = htonl(0xffffffff); + pattern[1].type = RTE_FLOW_ITEM_TYPE_IPV4; + pattern[1].spec = &ip_spec; + pattern[1].mask = &ip_mask; + } else if (af == AF_INET6) { + memset(&ip6_spec, 0, sizeof(struct rte_flow_item_ipv6)); + memset(&ip6_mask, 0, sizeof(struct rte_flow_item_ipv6)); + memcpy(&ip6_spec.hdr.dst_addr, &addr->in6, sizeof(ip6_spec.hdr.dst_addr)); + memset(&ip6_mask.hdr.dst_addr, 0xff, sizeof(ip6_mask.hdr.dst_addr)); + pattern[1].type = RTE_FLOW_ITEM_TYPE_IPV6; + pattern[1].spec = &ip6_spec; + pattern[1].mask = &ip6_mask; + } else { + return EDPVS_INVAL; + } + memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp)); + memset(&tcp_mask, 0, sizeof(struct rte_flow_item_tcp)); + tcp_spec.hdr.dst_port = port_base; + tcp_mask.hdr.dst_port = port_mask; + pattern[2].type = RTE_FLOW_ITEM_TYPE_TCP; + pattern[2].spec = &tcp_spec; + pattern[2].mask = &tcp_mask; + pattern[3].type = RTE_FLOW_ITEM_TYPE_END; + + /* set tcp flow */ + resp.size = flows->size; + resp.flow_num = 0; + resp.handlers = &flows->handlers[0]; + err = netif_flow_create(dev, &attr, pattern, action, &resp); + if (err) { + ret = EDPVS_RESOURCE; + RTE_LOG(ERR, FLOW, "%s: adding 
tcp sapool flow failed: %s ip %s port %d(0x%04X) mask 0x%04X," + " queue %d lcore %2d\n", __func__, dev->name, + inet_ntop(af, addr, ipbuf, sizeof(ipbuf)) ? : "::", + ntohs(port_base), ntohs(port_base), ntohs(port_mask), queue_id, cid); + } else { + nflows += resp.flow_num; + RTE_LOG(INFO, FLOW, "%s: adding tcp sapool flow succeed: %s ip %s port %d(0x%04X) mask 0x%04X," + " queue %d lcore %2d\n", __func__, dev->name, + inet_ntop(af, addr, ipbuf, sizeof(ipbuf)) ? : "::", + ntohs(port_base), ntohs(port_base), ntohs(port_mask), queue_id, cid); + } + + memset(&udp_spec, 0, sizeof(struct rte_flow_item_udp)); + memset(&udp_mask, 0, sizeof(struct rte_flow_item_udp)); + udp_spec.hdr.dst_port = port_base; + udp_mask.hdr.dst_port = port_mask; + pattern[2].type = RTE_FLOW_ITEM_TYPE_UDP; + pattern[2].spec = &udp_spec; + pattern[2].mask = &udp_mask; + /* set udp flow */ + resp.size = flows->size - nflows; + resp.flow_num = 0; + resp.handlers = &flows->handlers[nflows]; + err = netif_flow_create(dev, &attr, pattern, action, &resp); + if (err) { + ret = EDPVS_RESOURCE; + RTE_LOG(ERR, FLOW, "%s: adding udp sapool flow failed: %s ip %s port %d(0x%04X) mask 0x%04X," + " queue %d lcore %2d\n", __func__, dev->name, + inet_ntop(af, addr, ipbuf, sizeof(ipbuf)) ? : "::", + ntohs(port_base), ntohs(port_base), ntohs(port_mask), queue_id, cid); + } else { + nflows += resp.flow_num; + RTE_LOG(INFO, FLOW, "%s: adding udp sapool flow succeed: %s ip %s port %d(0x%04X) mask 0x%04X," + " queue %d lcore %2d\n", __func__, dev->name, + inet_ntop(af, addr, ipbuf, sizeof(ipbuf)) ? : "::", + ntohs(port_base), ntohs(port_base), ntohs(port_mask), queue_id, cid); + } + + flows->flow_num = nflows; + return ret; +} + +/* + * Delete sa_pool flow rules. + * + * Ether | IPv4/IPv6 | TCP/UDP + */ +int netif_sapool_flow_del(struct netif_port *dev, lcoreid_t cid, + int af, const union inet_addr *addr, + __be16 port_base, __be16 port_mask, + netif_flow_handler_param_t *flows) +{ + int err, ret = EDPVS_OK; + char ipbuf[64]; + + err = netif_flow_destroy(flows); + + if (err) { + err = EDPVS_RESOURCE; + RTE_LOG(ERR, FLOW, "%s: deleting sapool flow failed: %s ip %s port %d(0x%04X) mask 0x%04X\n", + __func__, dev->name, inet_ntop(af, addr, ipbuf, sizeof(ipbuf)) ? : "::", + ntohs(port_base), ntohs(port_base), ntohs(port_mask)); + } else { + flows->flow_num = 0; + RTE_LOG(INFO, FLOW, "%s: deleting sapool flow succeed: %s ip %s port %d(0x%04X) mask 0x%04X\n", + __func__, dev->name, inet_ntop(af, addr, ipbuf, sizeof(ipbuf)) ? 
: "::", + ntohs(port_base), ntohs(port_base), ntohs(port_mask)); + } + + return ret; +} diff --git a/src/pdump.c b/src/pdump.c index 9fecdec33..4587c8eac 100644 --- a/src/pdump.c +++ b/src/pdump.c @@ -33,7 +33,7 @@ int pdump_init(void) #ifdef CONFIG_DPVS_PDUMP if (g_dpvs_pdump) { /* initialize packet capture framework */ - err = rte_pdump_init(NULL); + err = rte_pdump_init(); } #endif diff --git a/src/route.c b/src/route.c index 81d704792..ed0321466 100644 --- a/src/route.c +++ b/src/route.c @@ -345,7 +345,7 @@ static int route_add_del(bool add, struct in_addr* dest, struct dpvs_msg *msg; struct dp_vs_route_conf cf; - if (cid != rte_get_master_lcore()) { + if (cid != rte_get_main_lcore()) { RTE_LOG(INFO, ROUTE, "[%s] must set from master lcore\n", __func__); return EDPVS_NOTSUPP; } @@ -759,8 +759,8 @@ int route_init(void) rte_atomic32_set(&this_num_routes, 0); rte_atomic32_set(&this_num_out_routes, 0); /* master core also need routes */ - rte_eal_mp_remote_launch(route_lcore_init, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(route_lcore_init, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, ROUTE, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); @@ -806,8 +806,8 @@ int route_term(void) if ((err = sockopt_unregister(&route_sockopts)) != EDPVS_OK) return err; - rte_eal_mp_remote_launch(route_lcore_term, NULL, CALL_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(route_lcore_term, NULL, CALL_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { if ((err = rte_eal_wait_lcore(cid)) < 0) { RTE_LOG(WARNING, ROUTE, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); diff --git a/src/sa_pool.c b/src/sa_pool.c index e115cc564..1495c2f56 100644 --- a/src/sa_pool.c +++ b/src/sa_pool.c @@ -23,10 +23,10 @@ * ways to achieve the goal. one is to calc RSS the same way of * NIC to select the currect CPU for connect. * - * the way we use is based on Flow-Director (fdir), allocate + * the way we use is based on DPDK Generic Flow(rte_flow), allocate * local source (e.g., ) for each CPU core in advance. - * and redirect the back traffic to that CPU by fdir. it does not - * need too many fdir rules, the number of rules can be equal to + * and redirect the back traffic to that CPU by rte_flow. it does not + * need too many flow rules, the number of rules can be equal to * the number of CPU core. * * LVS use laddr and try to see if is used when @@ -69,130 +69,25 @@ enum { SA_F_USED = 0x01, }; -struct sa_fdir { +struct sa_flow { /* the ports one lcore can use means - * "(fdir.mask & port) == port_base" */ + * "(sa_flow.mask & port) == port_base" */ uint16_t mask; /* filter's port mask */ lcoreid_t lcore; __be16 port_base; - uint16_t soft_id; /* current unsed soft-id, - increase after use. 
*/ uint16_t shift; }; -static struct sa_fdir sa_fdirs[DPVS_MAX_LCORE]; +static struct sa_flow sa_flows[DPVS_MAX_LCORE]; static uint8_t sa_nlcore; static uint64_t sa_lcore_mask; -static uint8_t sa_pool_hash_size = SAPOOL_DEF_HASH_SZ; - -static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid, - const union inet_addr *dip, __be16 dport, - uint32_t filter_id[MAX_FDIR_PROTO], bool add) -{ - queueid_t queue; - int err; - enum rte_filter_op op, rop; - - struct rte_eth_fdir_filter filt[MAX_FDIR_PROTO] = { - { - .action.behavior = RTE_ETH_FDIR_ACCEPT, - .action.report_status = RTE_ETH_FDIR_REPORT_ID, - .soft_id = filter_id[0], - }, - { - .action.behavior = RTE_ETH_FDIR_ACCEPT, - .action.report_status = RTE_ETH_FDIR_REPORT_ID, - .soft_id = filter_id[1], - }, - }; - - if (af == AF_INET) { - filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP; - filt[0].input.flow.tcp4_flow.ip.dst_ip = dip->in.s_addr; - filt[0].input.flow.tcp4_flow.dst_port = dport; - filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP; - filt[1].input.flow.udp4_flow.ip.dst_ip = dip->in.s_addr; - filt[1].input.flow.udp4_flow.dst_port = dport; - } else if (af == AF_INET6) { - filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_TCP; - memcpy(filt[0].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr)); - filt[0].input.flow.tcp6_flow.dst_port = dport; - filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_UDP; - memcpy(filt[1].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr)); - filt[1].input.flow.udp6_flow.dst_port = dport; - } else { - return EDPVS_NOTSUPP; - } - - if (dev->netif_ops && dev->netif_ops->op_filter_supported) { - if (dev->netif_ops->op_filter_supported(dev, RTE_ETH_FILTER_FDIR) < 0) { - if (dev->nrxq <= 1) - return EDPVS_OK; - RTE_LOG(ERR, SAPOOL, "%s: FDIR is not supported by device %s. Only" - " single rxq can be configured.\n", __func__, dev->name); - return EDPVS_NOTSUPP; - } - } else { - RTE_LOG(ERR, SAPOOL, "%s: FDIR support of device %s is not known.\n", - __func__, dev->name); - return EDPVS_INVAL; - } - - err = netif_get_queue(dev, cid, &queue); - if (err != EDPVS_OK) - return err; - - filt[0].action.rx_queue = filt[1].action.rx_queue = queue; - op = add ? RTE_ETH_FILTER_ADD : RTE_ETH_FILTER_DELETE; - - netif_mask_fdir_filter(af, dev, &filt[0]); - netif_mask_fdir_filter(af, dev, &filt[1]); - - err = netif_fdir_filter_set(dev, op, &filt[0]); - if (err != EDPVS_OK) - return err; - - err = netif_fdir_filter_set(dev, op, &filt[1]); - if (err != EDPVS_OK) { - rop = add ? RTE_ETH_FILTER_DELETE : RTE_ETH_FILTER_ADD; - netif_fdir_filter_set(dev, rop, &filt[0]); - return err; - } - -#ifdef CONFIG_DPVS_SAPOOL_DEBUG - { - char ipaddr[64]; - RTE_LOG(DEBUG, SAPOOL, "FDIR: %s %s %s TCP/UDP " - "ip %s port %d (0x%04x) mask 0x%04X queue %d lcore %2d filterID %d/%d\n", - add ? "add" : "del", dev->name, - af == AF_INET ? "IPv4" : "IPv6", - inet_ntop(af, dip, ipaddr, sizeof(ipaddr)) ? 
: "::", - ntohs(dport), ntohs(dport), sa_fdirs[cid].mask, queue, cid, - filter_id[0], filter_id[1]); - } -#endif - - return err; -} - -static inline int sa_add_filter(int af, struct netif_port *dev, lcoreid_t cid, - const union inet_addr *dip, __be16 dport, - uint32_t filter_id[MAX_FDIR_PROTO]) -{ - return __add_del_filter(af, dev, cid, dip, dport, filter_id, true); -} - -static inline int sa_del_filter(int af, struct netif_port *dev, lcoreid_t cid, - const union inet_addr *dip, __be16 dport, - uint32_t filter_id[MAX_FDIR_PROTO]) -{ - return __add_del_filter(af, dev, cid, dip, dport, filter_id, false); -} +static uint8_t sa_pool_hash_size = SAPOOL_DEF_HASH_SZ; +static bool sapool_flow_enable = true; static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, - const struct sa_fdir *fdir) + const struct sa_flow *flow) { int hash; struct sa_entry_pool *pool; @@ -202,7 +97,7 @@ static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, uint32_t sa_entry_size; uint32_t sa_entry_num; - sa_entry_num = MAX_PORT >> fdir->shift; + sa_entry_num = MAX_PORT >> flow->shift; sa_entry_pool_size = sizeof(struct sa_entry_pool) * hash_sz; sa_entry_size = sizeof(struct sa_entry) * sa_entry_num * hash_sz; @@ -214,7 +109,7 @@ static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, ap->pool_hash_sz = hash_sz; sep = (struct sa_entry *)&ap->pool_hash[hash_sz]; - /* the big loop takes about 17ms */ + /* the big loop may take tens of milliseconds */ for (hash = 0; hash < hash_sz; hash++) { pool = &ap->pool_hash[hash]; @@ -223,14 +118,14 @@ static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, pool->used_cnt = 0; pool->free_cnt = 0; - pool->shift = fdir->shift; + pool->shift = flow->shift; pool->sa_entries = &sep[sa_entry_num * hash]; for (port = ap->low; port <= ap->high; port++) { struct sa_entry *sa; - if (fdir->mask && - ((uint16_t)port & fdir->mask) != ntohs(fdir->port_base)) + if (flow->mask && + ((uint16_t)port & flow->mask) != ntohs(flow->port_base)) continue; sa = &pool->sa_entries[(uint16_t)(port >> pool->shift)]; @@ -246,7 +141,7 @@ static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz, static int sa_pool_free_hash(struct sa_pool *ap) { - /* FIXME: it may takes about 3ms to free the huge `sa->pool_hash`, and + /* FIXME: it may take about 3ms to free the huge `sa->pool_hash`, and * @rte_free uses a spinlock to protect its heap. If multiple workers * free their sapools simultaneously, a worker may be stuck up to 3*N ms, * where `N` is the dpvs worker number. @@ -254,7 +149,7 @@ static int sa_pool_free_hash(struct sa_pool *ap) * use mempool for sapool could solve the problem. we still use @rte_free * here considering sapool is not frequently changed. */ - rte_free(ap->pool_hash); /* it may takes up to 3ms */ + rte_free(ap->pool_hash); /* it may take up to 3ms */ ap->pool_hash_sz = 0; return EDPVS_OK; } @@ -262,23 +157,21 @@ static int sa_pool_free_hash(struct sa_pool *ap) static int sa_pool_add_filter(struct inet_ifaddr *ifa, struct sa_pool *ap, lcoreid_t cid) { - int err = EDPVS_OK; - uint32_t filtids[MAX_FDIR_PROTO]; - struct sa_fdir *fdir = &sa_fdirs[cid]; + int err; + struct sa_flow *flow = &sa_flows[cid]; - if (dp_vs_fdir_filter_enable) { - /* if add filter failed, waste some soft-id is acceptable. 
*/ - filtids[0] = fdir->soft_id++; - filtids[1] = fdir->soft_id++; + netif_flow_handler_param_t flow_handlers = { + .size = MAX_SA_FLOW, + .flow_num = 0, + .handlers = ap->flows, + }; - err = sa_add_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr, - fdir->port_base, filtids); + if (!sapool_flow_enable) + return EDPVS_OK; - if (err == EDPVS_OK) { - ap->filter_id[0] = filtids[0]; - ap->filter_id[1] = filtids[1]; - } - } + err = netif_sapool_flow_add(ifa->idev->dev, cid, ifa->af, &ifa->addr, + flow->port_base, htons(flow->mask), &flow_handlers); + ap->flow_num = flow_handlers.flow_num; return err; } @@ -286,25 +179,29 @@ static int sa_pool_add_filter(struct inet_ifaddr *ifa, struct sa_pool *ap, static int sa_pool_del_filter(struct inet_ifaddr *ifa, struct sa_pool *ap, lcoreid_t cid) { - int err = EDPVS_OK; - struct sa_fdir *fdir = &sa_fdirs[cid]; + struct sa_flow *flow = &sa_flows[cid]; - if (dp_vs_fdir_filter_enable) - err = sa_del_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr, - fdir->port_base, ap->filter_id); /* thread-safe ? */ + netif_flow_handler_param_t flow_handlers = { + .size = MAX_SA_FLOW, + .flow_num = ap->flow_num, + .handlers = ap->flows, + }; - return err; + if (!sapool_flow_enable) + return EDPVS_OK; + + return netif_sapool_flow_del(ifa->idev->dev, cid, ifa->af, &ifa->addr, + flow->port_base, htons(flow->mask), &flow_handlers); } int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high) { int err; struct sa_pool *ap; - struct sa_fdir *fdir; lcoreid_t cid = rte_lcore_id(); if (cid > 64 || !((sa_lcore_mask & (1UL << cid)))) { - if (cid == rte_get_master_lcore()) + if (cid == rte_get_main_lcore()) return EDPVS_OK; /* no sapool on master */ return EDPVS_INVAL; } @@ -317,8 +214,6 @@ int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high) return EDPVS_INVAL; } - fdir = &sa_fdirs[cid]; - ap = rte_zmalloc(NULL, sizeof(struct sa_pool), 0); if (unlikely(!ap)) return EDPVS_NOMEM; @@ -329,7 +224,7 @@ int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high) ap->flags = 0; rte_atomic32_set(&ap->refcnt, 1); - err = sa_pool_alloc_hash(ap, sa_pool_hash_size, fdir); + err = sa_pool_alloc_hash(ap, sa_pool_hash_size, &sa_flows[cid]); if (err != EDPVS_OK) { goto free_ap; } @@ -373,7 +268,7 @@ int sa_pool_destroy(struct inet_ifaddr *ifa) lcoreid_t cid = rte_lcore_id(); if (cid > 64 || !((sa_lcore_mask & (1UL << cid)))) { - if (cid == rte_get_master_lcore()) + if (cid == rte_get_main_lcore()) return EDPVS_OK; return EDPVS_INVAL; } @@ -834,13 +729,12 @@ int sa_pool_init(void) for (cid = 0; cid < DPVS_MAX_LCORE; cid++) { if (cid >= 64 || !(sa_lcore_mask & (1L << cid))) continue; - assert(rte_lcore_is_enabled(cid) && cid != rte_get_master_lcore()); + assert(rte_lcore_is_enabled(cid) && cid != rte_get_main_lcore()); - sa_fdirs[cid].mask = ~((~0x0) << shift); - sa_fdirs[cid].lcore = cid; - sa_fdirs[cid].port_base = htons(port_base); - sa_fdirs[cid].soft_id = 0; - sa_fdirs[cid].shift = shift; + sa_flows[cid].mask = ~((~0x0) << shift); + sa_flows[cid].lcore = cid; + sa_flows[cid].port_base = htons(port_base); + sa_flows[cid].shift = shift; port_base++; } @@ -856,7 +750,7 @@ int sa_pool_term(void) /* * config file */ -static void sa_pool_hash_size_conf(vector_t tokens) +static void sa_pool_hash_size_handler(vector_t tokens) { char *str = set_value(tokens); int size; @@ -874,8 +768,26 @@ static void sa_pool_hash_size_conf(vector_t tokens) FREE_PTR(str); } +static void sa_pool_flow_enable_handler(vector_t tokens) +{ + char *str = set_value(tokens); + + if 
(!str) + return; + + if (!strcasecmp(str, "on")) + sapool_flow_enable = true; + if (!strcasecmp(str, "off")) + sapool_flow_enable = false; + else + RTE_LOG(WARNING, SAPOOL, "sapool_filter_enable = %s\n", sapool_flow_enable ? "on" : "off"); + + FREE_PTR(str); +} + void install_sa_pool_keywords(void) { install_keyword_root("sa_pool", NULL); - install_keyword("pool_hash_size", sa_pool_hash_size_conf, KW_TYPE_INIT); + install_keyword("pool_hash_size", sa_pool_hash_size_handler, KW_TYPE_INIT); + install_keyword("flow_enable", sa_pool_flow_enable_handler, KW_TYPE_INIT); } diff --git a/src/scheduler.c b/src/scheduler.c index 1e8cb7444..d446f378b 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -243,5 +243,5 @@ int dpvs_lcore_start(int is_master) { if (is_master) return dpvs_job_loop(NULL); - return rte_eal_mp_remote_launch(dpvs_job_loop, NULL, SKIP_MASTER); + return rte_eal_mp_remote_launch(dpvs_job_loop, NULL, SKIP_MAIN); } diff --git a/src/tc/cls_match.c b/src/tc/cls_match.c index 0a2772769..267f063f9 100644 --- a/src/tc/cls_match.c +++ b/src/tc/cls_match.c @@ -49,7 +49,7 @@ static int match_classify(struct tc_cls *cls, struct rte_mbuf *mbuf, { struct match_cls_priv *priv = tc_cls_priv(cls); struct dp_vs_match *m = &priv->match; - struct ether_hdr *eh = rte_pktmbuf_mtod(mbuf, struct ether_hdr *); + struct rte_ether_hdr *eh = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); struct iphdr *iph = NULL; struct ip6_hdr *ip6h = NULL; struct tcphdr *th; diff --git a/src/tc/sch_pfifo_fast.c b/src/tc/sch_pfifo_fast.c index 855bbfebd..26759aba4 100644 --- a/src/tc/sch_pfifo_fast.c +++ b/src/tc/sch_pfifo_fast.c @@ -60,7 +60,7 @@ static int pfifo_fast_enqueue(struct Qsch *sch, struct rte_mbuf *mbuf) struct pfifo_fast_priv *priv; struct tc_mbuf_head *qh; - struct ether_hdr *eh = rte_pktmbuf_mtod(mbuf, struct ether_hdr *); + struct rte_ether_hdr *eh = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); struct iphdr *iph = NULL; struct ip6_hdr *ip6h = NULL; struct vlan_ethhdr *veh; diff --git a/src/tc/tc.c b/src/tc/tc.c index 3d295bee5..3fd61e0bb 100644 --- a/src/tc/tc.c +++ b/src/tc/tc.c @@ -128,7 +128,7 @@ struct rte_mbuf *tc_hook(struct netif_tc *tc, struct rte_mbuf *mbuf, if (flags & QSCH_F_INGRESS) { sch = tc->qsch_ingress; /* mbuf->packet_type was not set by DPVS for ingress */ - pkt_type = rte_pktmbuf_mtod(mbuf, struct ether_hdr *)->ether_type; + pkt_type = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *)->ether_type; } else { sch = tc->qsch; pkt_type = rte_cpu_to_be_16(mbuf->packet_type); diff --git a/src/timer.c b/src/timer.c index a7a195db4..26d744f93 100644 --- a/src/timer.c +++ b/src/timer.c @@ -427,8 +427,8 @@ int dpvs_timer_init(void) int err; /* per-lcore timer */ - rte_eal_mp_remote_launch(timer_lcore_init, NULL, SKIP_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(timer_lcore_init, NULL, SKIP_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { err = rte_eal_wait_lcore(cid); if (err < 0) { RTE_LOG(ERR, DTIMER, "%s: lcore %d: %s.\n", @@ -438,7 +438,7 @@ int dpvs_timer_init(void) } /* global timer */ - return timer_init_schedler(&g_timer_sched, rte_get_master_lcore()); + return timer_init_schedler(&g_timer_sched, rte_get_main_lcore()); } int dpvs_timer_term(void) @@ -447,8 +447,8 @@ int dpvs_timer_term(void) int err; /* per-lcore timer */ - rte_eal_mp_remote_launch(timer_lcore_term, NULL, SKIP_MASTER); - RTE_LCORE_FOREACH_SLAVE(cid) { + rte_eal_mp_remote_launch(timer_lcore_term, NULL, SKIP_MAIN); + RTE_LCORE_FOREACH_WORKER(cid) { err = rte_eal_wait_lcore(cid); if (err < 0) { 
RTE_LOG(WARNING, DTIMER, "%s: lcore %d: %s.\n", @@ -464,7 +464,7 @@ static inline struct timer_scheduler *this_lcore_sched(bool global) { /* any lcore (including master and slaves) can use global timer, * but only slave lcores can use per-lcore timer. */ - if (!global && rte_lcore_id() == rte_get_master_lcore()) { + if (!global && rte_lcore_id() == rte_get_main_lcore()) { RTE_LOG(ERR, DTIMER, "try get per-lcore timer from master\n"); return NULL; } diff --git a/src/vlan.c b/src/vlan.c index 10b35824b..46b44679c 100644 --- a/src/vlan.c +++ b/src/vlan.c @@ -78,7 +78,7 @@ static int alloc_vlan_info(struct netif_port *dev) static int vlan_xmit(struct rte_mbuf *mbuf, struct netif_port *dev) { struct vlan_dev_priv *vlan = netif_priv(dev); - struct ether_hdr *ethhdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *); + struct rte_ether_hdr *ethhdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); unsigned int len; int err; @@ -130,29 +130,6 @@ static int vlan_set_mc_list(struct netif_port *dev) return err; } -static int vlan_filter_supported(struct netif_port *dev, enum rte_filter_type fltype) -{ - struct netif_port *rdev; - struct vlan_dev_priv *vlan = netif_priv(dev); - assert(vlan && vlan->real_dev); - - rdev = vlan->real_dev; - - if (!rdev->netif_ops || !rdev->netif_ops->op_filter_supported) - return EDPVS_NOTSUPP; - - return rdev->netif_ops->op_filter_supported(rdev, fltype); -} - -static int vlan_set_fdir_filt(struct netif_port *dev, enum rte_filter_op op, - const struct rte_eth_fdir_filter *filt) -{ - struct vlan_dev_priv *vlan = netif_priv(dev); - assert(vlan && vlan->real_dev); - - return netif_fdir_filter_set(vlan->real_dev, op, filt); -} - static int vlan_get_queue(struct netif_port *dev, lcoreid_t cid, queueid_t *qid) { struct vlan_dev_priv *vlan = netif_priv(dev); @@ -188,8 +165,6 @@ static int vlan_get_stats(struct netif_port *dev, struct rte_eth_stats *stats) static struct netif_ops vlan_netif_ops = { .op_xmit = vlan_xmit, .op_set_mc_list = vlan_set_mc_list, - .op_filter_supported = vlan_filter_supported, - .op_set_fdir_filt = vlan_set_fdir_filt, .op_get_queue = vlan_get_queue, .op_get_link = vlan_get_link, .op_get_promisc = vlan_get_promisc, @@ -200,7 +175,7 @@ static void vlan_setup(struct netif_port *dev) { dev->netif_ops = &vlan_netif_ops; dev->mtu = VLAN_ETH_DATA_LEN; - dev->hw_header_len = sizeof(struct ether_hdr) + VLAN_HLEN; + dev->hw_header_len = sizeof(struct rte_ether_hdr) + VLAN_HLEN; } /* @ifname is optional or vlan dev name will be auto generated. */ @@ -258,7 +233,7 @@ int vlan_add_dev(struct netif_port *real_dev, const char *ifname, dev->flag &= ~NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD; dev->flag &= ~NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD; dev->type = PORT_TYPE_VLAN; - ether_addr_copy(&real_dev->addr, &dev->addr); + rte_ether_addr_copy(&real_dev->addr, &dev->addr); vlan = netif_priv(dev); memset(vlan, 0, sizeof(*vlan)); @@ -383,8 +358,8 @@ static inline int vlan_untag_mbuf(struct rte_mbuf *mbuf) if (mbuf->ol_flags & PKT_RX_VLAN_STRIPPED) return EDPVS_OK; - if (unlikely(mbuf_may_pull(mbuf, sizeof(struct ether_hdr) + \ - sizeof(struct vlan_hdr)) != 0)) + if (unlikely(mbuf_may_pull(mbuf, sizeof(struct rte_ether_hdr) + \ + sizeof(struct rte_vlan_hdr)) != 0)) return EDPVS_INVPKT; /* the data_off of mbuf is still at ethernet header. 
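
Since `data_off` still points at the ethernet header here, the 802.1Q tag sits immediately behind `struct rte_ether_hdr`. A hedged sketch of reading it, assuming the `mbuf_may_pull()` check above has already succeeded:

```c
#include <rte_byteorder.h>
#include <rte_ether.h>
#include <rte_mbuf.h>

/* Illustrative only: peek the 12-bit VLAN ID of a still-tagged frame. */
static uint16_t peek_vlan_id(const struct rte_mbuf *mbuf)
{
    const struct rte_ether_hdr *eh =
            rte_pktmbuf_mtod(mbuf, const struct rte_ether_hdr *);
    const struct rte_vlan_hdr *vh = (const struct rte_vlan_hdr *)(eh + 1);

    return rte_be_to_cpu_16(vh->vlan_tci) & 0x0fff;
}
```
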
*/ @@ -409,7 +384,7 @@ int vlan_rcv(struct rte_mbuf *mbuf, struct netif_port *real_dev) { struct netif_port *dev; struct vlan_dev_priv *vlan; - struct ether_hdr *ehdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *); + struct rte_ether_hdr *ehdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *); int err; err = vlan_untag_mbuf(mbuf); diff --git a/tools/dpip/eal_mem.c b/tools/dpip/eal_mem.c index 3331eef92..e861a02a7 100644 --- a/tools/dpip/eal_mem.c +++ b/tools/dpip/eal_mem.c @@ -37,13 +37,13 @@ static void list_eal_mem_seg_info(eal_all_mem_seg_ret_t *all_eal_mem_seg_ret) int i = 0; printf("%-10s %16s %16s %20s %20s %10s %10s %20s\n", - "socket_id", "phys_addr(Hex)", "virt_addr(Hex)", "len(KB)", + "socket_id", "iova(Hex)", "virt_addr(Hex)", "len(KB)", "hugepage_size(KB)","nchannel", "nrank", "free_len(KB)"); for (i = 0; i < all_eal_mem_seg_ret->seg_num; i++) { seg_ret = &all_eal_mem_seg_ret->seg_info[i]; printf("%-10d %16lx %16lx %20lu %20lu %10u %10u %20lu\n", - seg_ret->socket_id, seg_ret->phys_addr, seg_ret->virt_addr, + seg_ret->socket_id, seg_ret->iova, seg_ret->virt_addr, seg_ret->len / 1024, seg_ret->hugepage_sz / 1024, seg_ret->nchannel, seg_ret->nrank, seg_ret->free_seg_len / 1024); @@ -77,13 +77,13 @@ static void list_eal_mem_zone_info(eal_all_mem_zone_ret_t *all_eal_mem_zone_ret) int i = 0; printf("%-8s %32s %16s %16s %20s %20s %10s\n", "zone_id", - "zone_name", "phys_addr(Hex)", "virt_addr(Hex)", "len(KB)", "hugepage_size(KB)", + "zone_name", "iova(Hex)", "virt_addr(Hex)", "len(KB)", "hugepage_size(KB)", "socket_id"); for (i = 0; i < all_eal_mem_zone_ret->zone_num; i++) { zone_ret = &all_eal_mem_zone_ret->zone_info[i]; printf("%-8d %32s %16lx %16lx %20lu %20lu %10d\n", i, - zone_ret->name, zone_ret->phys_addr, zone_ret->virt_addr, + zone_ret->name, zone_ret->iova, zone_ret->virt_addr, zone_ret->len / 1024, zone_ret->hugepage_sz / 1024, zone_ret->socket_id); } } diff --git a/tools/keepalived/keepalived/check/Makefile.am b/tools/keepalived/keepalived/check/Makefile.am index 27d1ed314..8b904dc1f 100644 --- a/tools/keepalived/keepalived/check/Makefile.am +++ b/tools/keepalived/keepalived/check/Makefile.am @@ -14,7 +14,7 @@ noinst_LIBRARIES = libcheck.a libcheck_a_SOURCES = \ check_daemon.c check_data.c check_parser.c \ - check_api.c check_tcp.c check_http.c check_ssl.c \ + check_api.c check_tcp.c check_udp.c check_http.c check_ssl.c \ check_smtp.c check_misc.c check_dns.c check_print.c \ ipwrapper.c ipvswrapper.c libipvs.c sockopt.c diff --git a/tools/keepalived/keepalived/check/check_api.c b/tools/keepalived/keepalived/check/check_api.c index 3eec3dcee..ad09fdd08 100644 --- a/tools/keepalived/keepalived/check/check_api.c +++ b/tools/keepalived/keepalived/check/check_api.c @@ -38,6 +38,7 @@ #include "check_misc.h" #include "check_smtp.h" #include "check_tcp.h" +#include "check_udp.h" #include "check_http.h" #include "check_ssl.h" #include "check_dns.h" @@ -683,6 +684,7 @@ install_checkers_keyword(void) install_misc_check_keyword(); install_smtp_check_keyword(); install_tcp_check_keyword(); + install_udp_check_keyword(); install_http_check_keyword(); install_ssl_check_keyword(); install_dns_check_keyword(); diff --git a/tools/keepalived/keepalived/check/check_udp.c b/tools/keepalived/keepalived/check/check_udp.c new file mode 100644 index 000000000..5f1f3c6f6 --- /dev/null +++ b/tools/keepalived/keepalived/check/check_udp.c @@ -0,0 +1,363 @@ +/* + * Soft: Keepalived is a failover program for the LVS project + * . 
diff --git a/tools/keepalived/keepalived/check/Makefile.am b/tools/keepalived/keepalived/check/Makefile.am
index 27d1ed314..8b904dc1f 100644
--- a/tools/keepalived/keepalived/check/Makefile.am
+++ b/tools/keepalived/keepalived/check/Makefile.am
@@ -14,7 +14,7 @@ noinst_LIBRARIES = libcheck.a
 libcheck_a_SOURCES = \
     check_daemon.c check_data.c check_parser.c \
-    check_api.c check_tcp.c check_http.c check_ssl.c \
+    check_api.c check_tcp.c check_udp.c check_http.c check_ssl.c \
     check_smtp.c check_misc.c check_dns.c check_print.c \
     ipwrapper.c ipvswrapper.c libipvs.c sockopt.c
diff --git a/tools/keepalived/keepalived/check/check_api.c b/tools/keepalived/keepalived/check/check_api.c
index 3eec3dcee..ad09fdd08 100644
--- a/tools/keepalived/keepalived/check/check_api.c
+++ b/tools/keepalived/keepalived/check/check_api.c
@@ -38,6 +38,7 @@
 #include "check_misc.h"
 #include "check_smtp.h"
 #include "check_tcp.h"
+#include "check_udp.h"
 #include "check_http.h"
 #include "check_ssl.h"
 #include "check_dns.h"
@@ -683,6 +684,7 @@ install_checkers_keyword(void)
     install_misc_check_keyword();
     install_smtp_check_keyword();
     install_tcp_check_keyword();
+    install_udp_check_keyword();
     install_http_check_keyword();
     install_ssl_check_keyword();
     install_dns_check_keyword();
diff --git a/tools/keepalived/keepalived/check/check_udp.c b/tools/keepalived/keepalived/check/check_udp.c
new file mode 100644
index 000000000..5f1f3c6f6
--- /dev/null
+++ b/tools/keepalived/keepalived/check/check_udp.c
@@ -0,0 +1,363 @@
+/*
+ * Soft: Keepalived is a failover program for the LVS project
+ *       <www.linuxvirtualserver.org>. It monitor & manipulate
+ *       a loadbalanced server pool using multi-layer checks.
+ *
+ * Part: UDP checker.
+ *
+ * Author: Jie Liu,
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright (C) 2019-2019 Alexandre Cassen, <acassen@gmail.com>
+ */
+
+#include "config.h"
+
+/* system includes */
+#include
+#include
+
+/* local includes */
+#include "scheduler.h"
+#include "check_udp.h"
+#include "check_api.h"
+#include "memory.h"
+#include "ipwrapper.h"
+#include "layer4.h"
+#include "logger.h"
+#include "global_data.h"
+#include "smtp.h"
+#include "utils.h"
+#include "parser.h"
+
+static int udp_connect_thread(thread_ref_t);
+
+/* Configuration stream handling */
+static void
+free_udp_check(checker_t *checker)
+{
+    udp_check_t *udp_check = CHECKER_ARG(checker);
+
+    FREE_PTR(udp_check->payload);
+    FREE_PTR(udp_check->reply_data);
+    FREE_PTR(udp_check->reply_mask);
+    FREE(checker->co);
+    FREE(checker->data);
+    FREE(checker);
+}
+
+static void
+dump_udp_check(FILE *fp, const checker_t *checker)
+{
+    udp_check_t *udp_check = CHECKER_ARG(checker);
+
+    conf_write(fp, " Keepalive method = UDP_CHECK");
+    dump_checker_opts(fp, checker);
+
+    if (udp_check->payload)
+        conf_write(fp, " Payload len = %u", udp_check->payload_len);
+    else
+        conf_write(fp, " Payload specified = no");
+
+    conf_write(fp, " Require reply = %s", udp_check->require_reply ? "yes" : "no");
+    if (udp_check->require_reply) {
+        conf_write(fp, " Min reply length = %u", udp_check->min_reply_len);
+        conf_write(fp, " Max reply length = %u", udp_check->max_reply_len);
+        conf_write(fp, " Reply data len = %u", udp_check->reply_len);
+        if (udp_check->reply_data)
+            conf_write(fp, " Reply data mask = %s",
+                       udp_check->reply_mask ? "yes" : "no");
+    }
+}
+
+static bool
+compare_udp_check(const checker_t *a, const checker_t *b)
+{
+    return compare_conn_opts(a->co, b->co);
+}
+
+//static const checker_funcs_t udp_checker_funcs = { CHECKER_UDP, free_udp_check, dump_udp_check, compare_udp_check, NULL };
+
+static void
+udp_check_handler(__attribute__((unused)) const vector_t *strvec)
+{
+    udp_check_t *udp_check = MALLOC(sizeof (udp_check_t));
+
+    udp_check->min_reply_len = 0;
+    udp_check->max_reply_len = UINT8_MAX;
+
+    /* queue new checker */
+    queue_checker(free_udp_check, dump_udp_check, udp_connect_thread,
+                  compare_udp_check, udp_check, CHECKER_NEW_CO(), true);
+}
+
+static void
+payload_handler(const vector_t *strvec)
+{
+    udp_check_t *udp_check = CHECKER_GET();
+    char *hex_str;
+
+    if (vector_size(strvec) == 1) {
+        report_config_error(CONFIG_GENERAL_ERROR, "UDP_CHECK payload requires a payload");
+        return;
+    }
+
+    hex_str = make_strvec_str(strvec, 1);
+    udp_check->payload = STRDUP((const char*)hex_str);
+    udp_check->payload_len = strlen(hex_str); //read_hex_str(hex_str, &udp_check->payload, NULL);
+    if (!udp_check->payload_len)
+        report_config_error(CONFIG_GENERAL_ERROR, "Invalid hex string for UDP_CHECK payload");
+
+    FREE_ONLY(hex_str);
+}
+
+static void
+require_reply_handler(const vector_t *strvec)
+{
+    udp_check_t *udp_check = CHECKER_GET();
+    char *hex_str;
+
+    udp_check->require_reply = true;
+
+    if (vector_size(strvec) == 1)
+        return;
+
+    hex_str = make_strvec_str(strvec, 1);
+    udp_check->reply_data = STRDUP((const char*)hex_str);
+    udp_check->reply_len = strlen(hex_str); //read_hex_str(hex_str, &udp_check->reply_data, &udp_check->reply_mask);
+    if (!udp_check->reply_len)
+        report_config_error(CONFIG_GENERAL_ERROR, "Invalid hex string for UDP_CHECK reply");
+
+    FREE_ONLY(hex_str);
+}
+
+static void
+min_length_handler(const vector_t *strvec)
+{
+    udp_check_t *udp_check = CHECKER_GET();
+    unsigned len;
+
+    if (!read_unsigned_strvec(strvec, 1, &len, 0, UINT16_MAX, false)) {
+        report_config_error(CONFIG_GENERAL_ERROR, "UDP_CHECK min length %s not valid - must be between 0 & %d", strvec_slot(strvec, 1), UINT16_MAX);
+        return;
+    }
+
+    udp_check->min_reply_len = len;
+}
+
+static void
+max_length_handler(const vector_t *strvec)
+{
+    udp_check_t *udp_check = CHECKER_GET();
+    unsigned len;
+
+    if (!read_unsigned_strvec(strvec, 1, &len, 0, UINT16_MAX, false)) {
+        report_config_error(CONFIG_GENERAL_ERROR, "UDP_CHECK max length %s not valid - must be between 0 & %d", strvec_slot(strvec, 1), UINT16_MAX);
+        return;
+    }
+
+    udp_check->max_reply_len = len;
+}
+
+static void
+udp_check_end_handler(void)
+{
+    udp_check_t *udp_check = CHECKER_GET();
+
+    if (!check_conn_opts(CHECKER_GET_CO())) {
+        dequeue_new_checker();
+        return;
+    }
+
+    if (udp_check->min_reply_len > udp_check->max_reply_len)
+        report_config_error(CONFIG_GENERAL_ERROR, "UDP_CHECK min_reply length %d > max_reply_length %d - will always fail",
+                            udp_check->min_reply_len, udp_check->max_reply_len);
+}
+
+void
+install_udp_check_keyword(void)
+{
+    /* We don't want some common keywords */
+    install_keyword("UDP_CHECK", &udp_check_handler);
+    install_sublevel();
+    install_checker_common_keywords(true);
+    install_keyword("payload", &payload_handler);
+    install_keyword("require_reply", &require_reply_handler);
+    install_keyword("min_reply_length", &min_length_handler);
+    install_keyword("max_reply_length", &max_length_handler);
+    install_sublevel_end_handler(udp_check_end_handler);
+    install_sublevel_end();
+}
+
+static void
+udp_epilog(thread_ref_t thread, bool is_success)
+{
+    checker_t *checker;
+    unsigned long delay;
+    bool checker_was_up;
+    bool rs_was_alive;
+
+    checker = THREAD_ARG(thread);
+
+    delay = checker->delay_loop;
+    if (is_success || ((checker->is_up || !checker->has_run) && checker->retry_it >= checker->retry)) {
+        checker->retry_it = 0;
+
+        if (is_success && (!checker->is_up || !checker->has_run)) {
+            log_message(LOG_INFO, "UDP connection to %s success."
+                        , FMT_CHK(checker));
+            checker_was_up = checker->is_up;
+            rs_was_alive = checker->rs->alive;
+            update_svr_checker_state(UP, checker);
+            if (checker->rs->smtp_alert && !checker_was_up &&
+                (rs_was_alive != checker->rs->alive || !global_data->no_checker_emails))
+                smtp_alert(SMTP_MSG_RS, checker, NULL,
+                           "=> UDP CHECK succeed on service <=");
+        } else if (!is_success &&
+                   (checker->is_up || !checker->has_run)) {
+            if (checker->retry && checker->has_run)
+                log_message(LOG_INFO
+                            , "UDP_CHECK on service %s failed after %u retries."
+                            , FMT_CHK(checker)
+                            , checker->retry);
+            else
+                log_message(LOG_INFO
+                            , "UDP_CHECK on service %s failed."
+                            , FMT_CHK(checker));
+            checker_was_up = checker->is_up;
+            rs_was_alive = checker->rs->alive;
+            update_svr_checker_state(DOWN, checker);
+            if (checker->rs->smtp_alert && checker_was_up &&
+                (rs_was_alive != checker->rs->alive || !global_data->no_checker_emails))
+                smtp_alert(SMTP_MSG_RS, checker, NULL,
+                           "=> UDP CHECK failed on service <=");
+        }
+    } else if (checker->is_up) {
+        delay = checker->delay_before_retry;
+        ++checker->retry_it;
+    }
+
+    checker->has_run = true;
+
+    thread_add_timer(thread->master, udp_connect_thread, checker, delay);
+}
+
+static bool
+check_udp_reply(const uint8_t *recv_data, size_t len, const udp_check_t *udp_check)
+{
+    unsigned i;
+    unsigned check_len;
+
+    if (len < udp_check->min_reply_len ||
+        len > udp_check->max_reply_len)
+        return true;
+
+    /* We only check the lesser of len and udp_check->reply_len octets */
+    check_len = udp_check->reply_len;
+    if (len < check_len)
+        check_len = len;
+
+    /* Check the received data matches */
+    for (i = 0; i < check_len; i++) {
+        if ((recv_data[i] ^ udp_check->reply_data[i]))
+            return true;
+    }
+
+    /* Success */
+    return false;
+}
+
+static int
+udp_check_thread(thread_ref_t thread)
+{
+    checker_t *checker = THREAD_ARG(thread);
+    udp_check_t *udp_check = CHECKER_ARG(checker);
+    int status;
+    uint8_t *recv_buf = NULL;
+    size_t len = 0;
+
+    len = udp_check->require_reply ? (udp_check->max_reply_len + 1) : 1;
+    recv_buf = MALLOC(len);
+
+    status = udp_socket_state(thread->u.f.fd, thread, recv_buf, &len);
+
+    thread_close_fd(thread);
+
+    if (status == connect_success) {
+        /* coverity[var_deref_model] - udp_check->reply_data is only set if udp_check->require_reply is set */
+        if (udp_check->reply_data && check_udp_reply(recv_buf, len, udp_check)) {
+            if (checker->is_up &&
+                (global_data->checker_log_all_failures || checker->log_all_failures))
+                log_message(LOG_INFO, "UDP check to %s reply data mismatch."
+                            , FMT_CHK(checker));
+            udp_epilog(thread, false);
+        } else
+            udp_epilog(thread, true);
+    } else {
+        if (checker->is_up &&
+            (global_data->checker_log_all_failures || checker->log_all_failures))
+            log_message(LOG_INFO, "UDP connection to %s failed."
+                        , FMT_CHK(checker));
+        udp_epilog(thread, false);
+    }
+
+    if (recv_buf)
+        FREE(recv_buf);
+
+    return 0;
+}
+
+static int
+udp_connect_thread(thread_ref_t thread)
+{
+    checker_t *checker = THREAD_ARG(thread);
+    udp_check_t *udp_check = CHECKER_ARG(checker);
+    conn_opts_t *co = checker->co;
+    int fd;
+    int status;
+
+    /*
+     * Register a new checker thread & return
+     * if checker is disabled
+     */
+    if (!checker->enabled) {
+        thread_add_timer(thread->master, udp_connect_thread, checker,
+                         checker->delay_loop);
+        return 0;
+    }
+
+    if ((fd = socket(co->dst.ss_family, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, IPPROTO_UDP)) == -1) {
+        log_message(LOG_INFO, "UDP connect fail to create socket. Rescheduling.");
+        thread_add_timer(thread->master, udp_connect_thread, checker,
+                         checker->delay_loop);
+
+        return 0;
+    }
+
+    status = udp_bind_connect(fd, co, udp_check->payload, udp_check->payload_len);
+
+    /* handle udp connection status & register check worker thread */
+    if (udp_check_state(fd, status, thread, udp_check_thread, co->connection_to)) {
+        close(fd);
+        udp_epilog(thread, false);
+    }
+
+    return 0;
+}
+
+#ifdef THREAD_DUMP
+void
+register_check_udp_addresses(void)
+{
+    register_thread_address("udp_check_thread", udp_check_thread);
+    register_thread_address("udp_connect_thread", udp_connect_thread);
+}
+#endif
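Note: with the checker above wired into check_api.c, a UDP_CHECK block becomes available in keepalived.conf. An illustrative configuration (addresses and values are examples; connect_timeout and retry come from the common checker keywords installed by install_checker_common_keywords()):

    virtual_server 192.168.88.100 53 {
        protocol UDP
        real_server 192.168.88.10 53 {
            UDP_CHECK {
                retry 3
                connect_timeout 5
                payload hello            # sent verbatim in this adaptation
                require_reply
                min_reply_length 1
                max_reply_length 255     # default is UINT8_MAX (255)
            }
        }
    }

Two details of this adaptation are worth noting: payload and reply data are stored verbatim with STRDUP() rather than hex-decoded (the read_hex_str() calls are commented out), and udp_check_thread() always allocates recv_buf, so as the code reads, a real server that never answers fails the check even without require_reply.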
"" : "V6", fd); + +#ifdef _WITH_SO_MARK_ + if (co->fwmark) { + if (setsockopt (fd, SOL_SOCKET, SO_MARK, &co->fwmark, sizeof (co->fwmark)) < 0) { + log_message(LOG_ERR, "Error setting fwmark %u to socket: %s", co->fwmark, strerror(errno)); + return connect_error; + } + } +#endif + + /* Bind socket */ + if (PTR_CAST_CONST(struct sockaddr, bind_addr)->sa_family != AF_UNSPEC) { + addrlen = sizeof(*bind_addr); + if (bind(fd, PTR_CAST_CONST(struct sockaddr, bind_addr), addrlen) != 0) { + log_message(LOG_INFO, "bind failed. errno: %d, error: %s", errno, strerror(errno)); + return connect_error; + } + } + + /* Set remote IP and connect */ + addrlen = sizeof(*addr); + ret = connect(fd, PTR_CAST_CONST(struct sockaddr, addr), addrlen); + + if (ret < 0) { + /* We want to know about the error, but not repeatedly */ + if (errno != co->last_errno) { + co->last_errno = errno; + if (__test_bit(LOG_DETAIL_BIT, &debug)) + log_message(LOG_INFO, "UDP connect error %d - %m", errno); + } + + return connect_error; + } + + /* Send udp packet */ + ret = send(fd, payload, payload_len, 0); + + if (ret == payload_len) + return connect_success; + + if (ret == -1) { + /* We want to know about the error, but not repeatedly */ + if (errno != co->last_errno) { + co->last_errno = errno; + if (__test_bit(LOG_DETAIL_BIT, &debug)) + log_message(LOG_INFO, "UDP send error %d - %m", errno); + } + } + else if (__test_bit(LOG_DETAIL_BIT, &debug)) + log_message(LOG_INFO, "udp_bind_connect send - sent %zd bytes instead of %zu", ret, sizeof(buf)); + + return connect_error; +} + +static enum connect_result +udp_socket_error(int fd) +{ + struct msghdr msg; + char name_buf[128]; + struct iovec iov; + char control[2560] __attribute__((aligned(__alignof__(struct cmsghdr)))); + struct icmphdr icmph; + struct cmsghdr *cmsg; /* Control related data */ + struct sock_extended_err *sock_err; + ssize_t n; + + iov.iov_base = &icmph; + iov.iov_len = sizeof icmph; + msg.msg_name = name_buf; + msg.msg_namelen = sizeof(name_buf); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof control; + msg.msg_flags = 0; + + n = recvmsg(fd, &msg, MSG_ERRQUEUE); + + if (n == -1) { + log_message(LOG_INFO, "udp_socket_error recvmsg failed - errno %d", errno); + return connect_success; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + sock_err = PTR_CAST(struct sock_extended_err, CMSG_DATA(cmsg)); + if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_RECVERR) { + if (sock_err) { + /* We are interested in ICMP errors */ + if (sock_err->ee_origin == SO_EE_ORIGIN_ICMP && sock_err->ee_type == ICMP_DEST_UNREACH) { +#ifdef ICMP_DEBUG + /* Handle ICMP errors types */ + switch (sock_err->ee_code) + { + case ICMP_NET_UNREACH: + /* Handle this error */ + log_message(LOG_INFO, "Network Unreachable Error"); + break; + case ICMP_HOST_UNREACH: + /* Handle this error */ + log_message(LOG_INFO, "Host Unreachable Error"); + break; + case ICMP_PORT_UNREACH: + /* Handle this error */ + log_message(LOG_INFO, "Port Unreachable Error"); + break; + default: + log_message(LOG_INFO, "Unreach code %d", sock_err->ee_code); + } +#endif + return connect_error; +#ifndef ICMP_DEBUG + } + } + } +#else + } else + log_message(LOG_INFO, "ee_origin %d, ee_type %d", sock_err->ee_origin, sock_err->ee_type); + } else + log_message(LOG_INFO, "No CMSG_DATA"); + } +#endif + else if (cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_RECVERR) { + if (sock_err) { + /* We are interested in ICMP errors */ + if 
(sock_err->ee_origin == SO_EE_ORIGIN_ICMP6 && sock_err->ee_type == ICMPV6_DEST_UNREACH) { +#ifdef ICMP_DEBUG + /* Handle ICMP errors types */ + switch (sock_err->ee_code) + { + case ICMPV6_NOROUTE: + /* Handle this error */ + log_message(LOG_INFO, "No Route Error"); + break; + case ICMPV6_ADDR_UNREACH: + /* Handle this error */ + log_message(LOG_INFO, "Address Unreachable Error"); + break; + case ICMPV6_PORT_UNREACH: + /* Handle this error */ + log_message(LOG_INFO, "Port Unreachable Error"); + break; + default: + log_message(LOG_INFO, "Unreach code %d", sock_err->ee_code); + } +#endif + return connect_error; +#ifndef ICMP_DEBUG + } + } + } +#else + } else + log_message(LOG_INFO, "ee_origin %d, ee_type %d", sock_err->ee_origin, sock_err->ee_type); + } else + log_message(LOG_INFO, "No CMSG_DATA"); + } + else + log_message(LOG_INFO, "cmsg_level %d, cmsg->type %d", cmsg->cmsg_level, cmsg->cmsg_type); +#endif + } + + return connect_success; +} + +enum connect_result +udp_socket_state(int fd, thread_ref_t thread, uint8_t *recv_buf, size_t *len) +{ + int ret; + + /* Handle Read timeout, we consider it success unless require_reply is set */ + if (thread->type == THREAD_READ_TIMEOUT) + return recv_buf ? connect_error : connect_success; + + if (thread->type == THREAD_READ_ERROR) + return udp_socket_error(fd); + + ret = recv(fd, recv_buf, *len, 0); + + /* Ret less than 0 means the port is unreachable. + * Otherwise, we consider it success. + */ + + if (ret < 0) + return connect_error; + + *len = ret; + return connect_success; +} + +bool +udp_check_state(int fd, enum connect_result status, thread_ref_t thread, + thread_func_t func, unsigned long timeout) +{ + checker_t *checker; + + checker = THREAD_ARG(thread); + + if (status == connect_success) { + thread_add_read(thread->master, func, checker, fd, timeout, true); + return false; + } + + return true; +} + #endif diff --git a/tools/keepalived/keepalived/include/check_udp.h b/tools/keepalived/keepalived/include/check_udp.h new file mode 100644 index 000000000..420a3672e --- /dev/null +++ b/tools/keepalived/keepalived/include/check_udp.h @@ -0,0 +1,48 @@ +/* + * Soft: Keepalived is a failover program for the LVS project + * . It monitor & manipulate + * a loadbalanced server pool using multi-layer checks. + * + * Part: check_udp.c include file. + * + * Author: Jie Liu, + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
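Note: udp_socket_error() above relies on a Linux-specific pattern: on a connected UDP socket with IP_RECVERR enabled, an incoming ICMP destination-unreachable is queued on the socket error queue instead of being silently dropped, so a health checker can distinguish "port closed" from "no answer". A self-contained, IPv4-only sketch of the same technique (probe_udp_port is an illustrative name; the probe payload is assumed already sent, as udp_bind_connect() does above):

    #include <errno.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>
    #include <linux/errqueue.h>
    #include <linux/icmp.h>

    /* fd: connected UDP socket with IP_RECVERR set and a datagram sent.
     * Returns -1 if the peer replied with ICMP port unreachable, 0 otherwise. */
    static int probe_udp_port(int fd)
    {
        char data[64], cbuf[512];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
            .msg_iov = &iov, .msg_iovlen = 1,
            .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cm;

        if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
            return 0;    /* nothing queued (e.g. EAGAIN): no ICMP error seen */

        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
            if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) {
                const struct sock_extended_err *ee =
                    (const struct sock_extended_err *)CMSG_DATA(cm);
                if (ee->ee_origin == SO_EE_ORIGIN_ICMP &&
                    ee->ee_type == ICMP_DEST_UNREACH &&
                    ee->ee_code == ICMP_PORT_UNREACH)
                    return -1;    /* real server rejected the probe */
            }
        }
        return 0;
    }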
diff --git a/tools/keepalived/keepalived/include/check_udp.h b/tools/keepalived/keepalived/include/check_udp.h
new file mode 100644
index 000000000..420a3672e
--- /dev/null
+++ b/tools/keepalived/keepalived/include/check_udp.h
@@ -0,0 +1,48 @@
+/*
+ * Soft: Keepalived is a failover program for the LVS project
+ *       <www.linuxvirtualserver.org>. It monitor & manipulate
+ *       a loadbalanced server pool using multi-layer checks.
+ *
+ * Part: check_udp.c include file.
+ *
+ * Author: Jie Liu,
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright (C) 2019-2019 Alexandre Cassen, <acassen@gmail.com>
+ */
+
+#ifndef _CHECK_UDP_H
+#define _CHECK_UDP_H
+
+#include "config.h"
+
+#include
+
+
+typedef struct _udp_check {
+    uint16_t payload_len;
+    uint8_t  *payload;
+    bool     require_reply;
+    uint16_t reply_len;
+    uint8_t  *reply_data;
+    uint8_t  *reply_mask;
+    uint16_t min_reply_len;
+    uint16_t max_reply_len;
+} udp_check_t;
+
+/* Prototypes defs */
+extern void install_udp_check_keyword(void);
+#ifdef THREAD_DUMP
+extern void register_check_udp_addresses(void);
+#endif
+
+#endif
diff --git a/tools/keepalived/keepalived/include/layer4.h b/tools/keepalived/keepalived/include/layer4.h
index 3e66f1d3a..6f47bb8c2 100644
--- a/tools/keepalived/keepalived/include/layer4.h
+++ b/tools/keepalived/keepalived/include/layer4.h
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 
 /* local includes */
 #include "scheduler.h"
@@ -49,6 +50,7 @@ typedef struct _conn_opts {
 #ifdef _WITH_SO_MARK_
     unsigned int fwmark;    /* to mark packets going out of the socket using SO_MARK */
 #endif
+    int last_errno;         /* Errno from last call to connect */
 } conn_opts_t;
 
 /* Prototypes defs */
@@ -98,6 +100,10 @@ tcp_connection_state(int fd, enum connect_result status, thread_ref_t thread,
 {
     return socket_connection_state(fd, status, thread, func, timeout);
 }
+
+extern enum connect_result udp_bind_connect(int, conn_opts_t *, uint8_t *, uint16_t);
+extern enum connect_result udp_socket_state(int, thread_ref_t, uint8_t *, size_t *);
+extern bool udp_check_state(int, enum connect_result, thread_ref_t, thread_func_t, unsigned long);
 #endif
 
 #endif
diff --git a/tools/keepalived/lib/align.h b/tools/keepalived/lib/align.h
new file mode 100644
index 000000000..aba548b5d
--- /dev/null
+++ b/tools/keepalived/lib/align.h
@@ -0,0 +1,111 @@
+/*
+ * Soft: Keepalived is a failover program for the LVS project
+ *       <www.linuxvirtualserver.org>. It monitor & manipulate
+ *       a loadbalanced server pool using multi-layer checks.
+ *
+ * Part: align.h include file.
+ *
+ * Author: Quentin Armitage
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright (C) 2020-2020 Alexandre Cassen, <acassen@gmail.com>
+ */
+
+#ifndef _ALIGN_H
+#define _ALIGN_H
+
+#include "config.h"
+
+#ifdef CHECK_CAST_ALIGN
+#include "logger.h"
+#endif
+
+/* PTR_CAST and PTR_CAST_CONST should be used for all casts of pointers.
+ *
+ * PTR_CAST and PTR_CAST_CONST serve several purposes.
+ *
+ * 1) On 32 bit ARM systems which don't support unaligned memory access, configure
+ *    will have defined CAST_VIA_VOID to avoid the compiler spewing out 1000s of
+ *    "cast increases required alignment of target type" warnings which are caused
+ *    due to the char * used in the cast possibly not being aligned for the pointer
+ *    being cast to. CAST_VIA_VOID merely means that the char * is first cast to a
+ *    void * which is then cast to the pointer of the type required. Casting via a
+ *    void * should not alter the code produced by the compiler, since the initial
+ *    pointer (char *) only has 1 byte alignment.
+ *
+ *    This still leaves the problem that, if the keepalived code is not correct, there
+ *    may be an unaligned pointer being used. See 2) below for how this is dealt with.
+ *
+ *    On systems which do allow unaligned memory access, the warnings generated by
+ *    not using a void * can be generated by using configure options:
+ *      --enable-strict-cast-align --disable-cast-via-void
+ *
+ * 2) As identified in 1) above, there is a need to be able to ensure that there
+ *    are no unaligned casts, both for performance reasons on systems which do allow
+ *    unaligned casts, and to ensure that there are not alignment traps, or worse
+ *    still incorrect values returned (which happens with ARMv5) from unaligned reads.
+ *
+ *    For this reason there is a configure option --enable-cast-align-checks which
+ *    defines CHECK_CAST_ALIGN. This causes PTR_CAST and PTR_CAST_CONST to generate
+ *    run-time code to check that casts made via PTR_CAST and PTR_CAST_CONST are
+ *    properly aligned, and logs a message if they are not. The checks work on any
+ *    architecture, whether unaligned memory access works or not, and so can be
+ *    performed on Intel x86_64, aarch64 etc.
+ *
+ *    Developers should periodically build with this option enabled and then run
+ *    keepalived to check that there are no unaligned casts. 22 such instances of
+ *    unaligned char arrays being cast to structure pointers with greater alignment
+ *    were found when this check was first added.
+ *
+ * 3) Other cast checks can be added later by simply adding further definitions for
+ *    PTR_CAST and PTR_CAST_CONST, probably just by adding a further definition of
+ *    PTR_CAST_ALL.
+ */
+
+#ifdef CAST_VIA_VOID
+#define __CAST_PTR(__const) (__const void *)
+#define PTR_CAST_ASSIGN (void *)
+#define PTR_CAST_ASSIGN_CONST (const void *)
+#else
+#define __CAST_PTR(__const)
+#define PTR_CAST_ASSIGN
+#define PTR_CAST_ASSIGN_CONST
+#endif
+
+#ifdef CHECK_CAST_ALIGN
+#define PTR_CAST_ALL(__type, __ptr, __const) ({ \
+    __const void *sav_ptr = __ptr; \
+    if ((long)sav_ptr % __alignof__(__type)) \
+        log_message(LOG_INFO, "Alignment error - (" #__type " *)(" #__ptr ") - alignment %zu, address %p", __alignof__(__type), sav_ptr); \
+    (__const __type *) __CAST_PTR(__const) (sav_ptr); \
+    })
+
+#define PTR_CAST2_ALL(__type, __type1, __ptr, __field, __const) ({ \
+    __const void *sav_ptr1 = __ptr; \
+    if ((long)sav_ptr1 % __alignof__(__type1)) \
+        printf("Alignment error - (" #__type1 " *)(" #__ptr ") - alignment %zu, address %p", __alignof__(__type1), sav_ptr1); \
+    PTR_CAST_ALL(__type, &(((__const __type1 *) __CAST_PTR(__const) (sav_ptr1))->__field), __const);\
+    })
+#else
+#define PTR_CAST_ALL(__type, __ptr, __const) \
+    ({ (__const __type *) __CAST_PTR(__const) (__ptr); })
+#define PTR_CAST2_ALL(__type, __type1, __ptr, __field, __const) \
+    ({ (__const __type *) __CAST_PTR(__const) &((__const __type1 *) __CAST_PTR(__const) (__ptr))->__field; })
+#endif
+
+#define PTR_CAST(__type, __ptr) PTR_CAST_ALL(__type, __ptr,)
+#define PTR_CAST_CONST(__type, __ptr) PTR_CAST_ALL(__type, __ptr, const)
+
+#define PTR_CAST2(__type, __type1, __ptr, __field) PTR_CAST2_ALL(__type, __type1, __ptr, __field,)
+#define PTR_CAST2_CONST(__type, __type1, __ptr, __field) PTR_CAST2_ALL(__type, __type1, __ptr, __field, const)
+
+#endif
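Note: PTR_CAST and PTR_CAST_CONST are drop-in replacements for bare pointer casts; built with --enable-cast-align-checks they log any cast whose target type has stricter alignment than the actual address. A usage sketch (struct four_tuple and peek_saddr are hypothetical, not keepalived code):

    #include <stdint.h>
    #include "align.h"

    struct four_tuple {        /* hypothetical on-wire layout */
        uint32_t saddr, daddr;
        uint16_t sport, dport;
    };

    static uint32_t
    peek_saddr(const char *pkt)    /* pkt: raw bytes, alignment unknown */
    {
        /* Instead of the bare cast "(const struct four_tuple *)pkt": */
        const struct four_tuple *ft = PTR_CAST_CONST(struct four_tuple, pkt);
        return ft->saddr;
    }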
"memory.h" @@ -309,6 +311,32 @@ vector_dump(FILE *fp, const vector_t *v) #endif /* String vector related */ +char * +make_strvec_str(const vector_t *v, unsigned start) +{ + size_t len; + char *str; + unsigned i; + + for (i = start, len = 0; i < v->allocated; i++) { + if (v->slot[i]) + len += strlen(v->slot[i]) + 1; + } + + str = MALLOC(len); + + for (i = start, len = 0; i < v->allocated; i++) { + if (v->slot[i]) { + if (i > start) + str[len++] = ' '; + strcpy(str + len, v->slot[i]); + len += strlen(v->slot[i]); + } + } + + return str; +} + void free_strvec(const vector_t *strvec) { diff --git a/tools/keepalived/lib/vector.h b/tools/keepalived/lib/vector.h index 42e294d5f..87b0767e0 100644 --- a/tools/keepalived/lib/vector.h +++ b/tools/keepalived/lib/vector.h @@ -73,6 +73,7 @@ extern void vector_free_r(const vector_t *); #ifdef _INCLUDE_UNUSED_CODE_ extern void vector_dump(FILE *fp, const vector_t *); #endif +extern char *make_strvec_str(const vector_t *v, unsigned start); extern void free_strvec(const vector_t *); #endif