diff --git a/README.md b/README.md index 0ecd9a8b5..c74f87204 100644 --- a/README.md +++ b/README.md @@ -38,30 +38,35 @@ Currently, besides authorized DNS server of DNSPod, there are various products i yum install numactl-devel # on Centos #sudo apt-get install libnuma-dev # on Ubuntu + # Install dependencies (FreeBSD only) + #pkg install meson pkgconf py38-pyelftools + cd f-stack # Compile DPDK cd dpdk/usertools ./dpdk-setup.sh # compile with x86_64-native-linuxapp-gcc - # Set hugepage + # Set hugepage (Linux only) # single-node system echo 1024 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages - # or NUMA + # or NUMA (Linux only) echo 1024 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages echo 1024 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages - # Using Hugepage with the DPDK + # Using Hugepage with the DPDK (Linux only) mkdir /mnt/huge mount -t hugetlbfs nodev /mnt/huge - # Close ASLR; it is necessary in multiple process + # Close ASLR; it is necessary in multiple process (Linux only) echo 0 > /proc/sys/kernel/randomize_va_space # Install python for running DPDK python scripts sudo apt install python # On ubuntu + #sudo pkg install python # On FreeBSD # Offload NIC + # For Linux: modprobe uio insmod /data/f-stack/dpdk/x86_64-native-linuxapp-gcc/kmod/igb_uio.ko insmod /data/f-stack/dpdk/x86_64-native-linuxapp-gcc/kmod/rte_kni.ko carrier=on # carrier=on is necessary, otherwise need to be up `veth0` via `echo 1 > /sys/class/net/veth0/carrier` @@ -69,6 +74,15 @@ Currently, besides authorized DNS server of DNSPod, there are various products i ifconfig eth0 down python dpdk-devbind.py --bind=igb_uio eth0 # assuming that use 10GE NIC and eth0 + # For FreeBSD: + # Refer DPDK FreeBSD guide to set tunables in /boot/loader.conf + # Below is an example used for our testing machine + #echo "hw.nic_uio.bdfs=\"2:0:0\"" >> /boot/loader.conf + #echo "hw.contigmem.num_buffers=1" >> /boot/loader.conf + #echo 
"hw.contigmem.buffer_size=1073741824" >> /boot/loader.conf + #kldload contigmem + #kldload nic_uio + # Install DPDK cd ../x86_64-native-linuxapp-gcc make install @@ -77,20 +91,23 @@ Currently, besides authorized DNS server of DNSPod, there are various products i #sudo apt-get install gawk # or execute `sudo update-alternatives --config awk` to choose gawk. # Install dependencies for F-Stack - sudo apt install gcc make libssl-dev # On ubuntu + sudo apt install gcc make libssl-dev # On ubuntu + #sudo pkg install gcc gmake openssl pkgconf libepoll-shim # On FreeBSD # Compile F-Stack export FF_PATH=/data/f-stack export FF_DPDK=/data/f-stack/dpdk/x86_64-native-linuxapp-gcc cd ../../lib/ - make + make # On Linux + #gmake # On FreeBSD # Install F-STACK # libfstack.a will be installed to /usr/local/lib # ff_*.h will be installed to /usr/local/include # start.sh will be installed to /usr/local/bin/ff_start # config.ini will be installed to /etc/f-stack.conf - make install + make install # On Linux + #gmake install # On FreeBSD #### Nginx diff --git a/dpdk/lib/librte_eal/freebsd/eal/include/rte_os.h b/dpdk/lib/librte_eal/freebsd/eal/include/rte_os.h index 908c37e9a..c93c87d08 100644 --- a/dpdk/lib/librte_eal/freebsd/eal/include/rte_os.h +++ b/dpdk/lib/librte_eal/freebsd/eal/include/rte_os.h @@ -18,6 +18,22 @@ extern "C" { typedef cpuset_t rte_cpuset_t; #ifdef RTE_EAL_FREEBSD_CPUSET_LEGACY +#if __FreeBSD_version >= 1301000 +#define RTE_CPU_AND(dst, src1, src2) do \ +{ \ + cpuset_t tmp; \ + CPU_COPY(src1, &tmp); \ + CPU_AND(&tmp, &tmp, src2); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#define RTE_CPU_OR(dst, src1, src2) do \ +{ \ + cpuset_t tmp; \ + CPU_COPY(src1, &tmp); \ + CPU_OR(&tmp, &tmp, src2); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#else #define RTE_CPU_AND(dst, src1, src2) do \ { \ cpuset_t tmp; \ @@ -32,6 +48,7 @@ typedef cpuset_t rte_cpuset_t; CPU_OR(&tmp, src2); \ CPU_COPY(&tmp, dst); \ } while (0) +#endif #define RTE_CPU_FILL(set) CPU_FILL(set) /* In FreeBSD 13 
CPU_NAND macro is CPU_ANDNOT */ @@ -44,6 +61,15 @@ typedef cpuset_t rte_cpuset_t; CPU_COPY(&tmp, dst); \ } while (0) #else +#if __FreeBSD_version >= 1301000 +#define RTE_CPU_NOT(dst, src) do \ +{ \ + cpuset_t tmp; \ + CPU_FILL(&tmp); \ + CPU_ANDNOT(&tmp, &tmp, src); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#else #define RTE_CPU_NOT(dst, src) do \ { \ cpuset_t tmp; \ @@ -51,6 +77,7 @@ typedef cpuset_t rte_cpuset_t; CPU_ANDNOT(&tmp, src); \ CPU_COPY(&tmp, dst); \ } while (0) +#endif #endif /* CPU_NAND */ #else /* RTE_EAL_FREEBSD_CPUSET_LEGACY */ diff --git a/example/Makefile b/example/Makefile index 1e9bb424e..167febdaf 100644 --- a/example/Makefile +++ b/example/Makefile @@ -14,7 +14,7 @@ LIBS+= -Wl,--no-whole-archive -lrt -lm -ldl -lcrypto -pthread -lnuma TARGET="helloworld" all: - cc -O -gdwarf-2 -I../lib -o ${TARGET} main.c ${LIBS} + cc -O -gdwarf-2 -I../lib -DINET6 -o ${TARGET} main.c ${LIBS} cc -O -gdwarf-2 -I../lib -o ${TARGET}_epoll main_epoll.c ${LIBS} .PHONY: clean diff --git a/lib/Makefile b/lib/Makefile index 73ee5ef16..c1e13405f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,6 +1,6 @@ -# +# # Copyright (c) 2013 Patrick Kelsey. All rights reserved. -# Copyright (C) 2017 THL A29 Limited, a Tencent company. +# Copyright (C) 2017-2021 THL A29 Limited, a Tencent company. # All rights reserved. # # Derived in part from libuinet's Makefile. 
@@ -20,19 +20,31 @@ PREFIX_INCLUDE=/usr/local/include PREFIX_BIN=/usr/local/bin F-STACK_CONF=/etc/f-stack.conf F-STACK_VERSION=1.21 +TGT_OS=$(shell uname) +ifeq ($(TGT_OS),FreeBSD) +CC=gcc +endif HOST_OS:=$(shell uname -s) #DEBUG=-O0 -gdwarf-2 -g3 -Wno-format-truncation +# No DPDK KNI support on FreeBSD +ifneq ($(TGT_OS),FreeBSD) FF_KNI=1 +endif + #FF_FLOW_ISOLATE=1 +#FF_FDIR=1 + +# NETGRAPH drivers ipfw #FF_NETGRAPH=1 #FF_IPFW=1 + #FF_USE_PAGE_ARRAY=1 #FF_ZC_SEND=1 FF_INET6=1 - +#FF_IPSEC=1 include ${TOPDIR}/mk/kern.pre.mk @@ -45,7 +57,7 @@ endif endif ifdef RTE_SDK -ifeq (${MACHINE_CPUARCH},aarch64) +ifeq (${MACHINE_CPUARCH},aarch64) FF_DPDK=${RTE_SDK}/build else FF_DPDK=${RTE_SDK}/x86_64-native-linuxapp-gcc @@ -65,10 +77,15 @@ INCLUDES+= -I./opt # Include search path for files that only include host OS headers HOST_INCLUDES= -I. +# Use libepoll shim on FreeBSD +ifeq ($(TGT_OS),FreeBSD) +HOST_INCLUDES+= -I/usr/local/include/libepoll-shim +endif + ifndef DEBUG HOST_CFLAGS = -O2 -frename-registers -funswitch-loops -fweb -Wno-format-truncation else -HOST_CFLAGS = ${DEBUG} +HOST_CFLAGS = ${DEBUG} endif ifdef FF_KNI @@ -78,6 +95,10 @@ endif HOST_CFLAGS+= ${DPDK_CFLAGS} HOST_CFLAGS+= ${CONF_CFLAGS} +ifdef FF_FDIR +HOST_CFLAGS+= -DFF_FDIR +endif + ifdef FF_FLOW_ISOLATE HOST_CFLAGS+= -DFF_FLOW_ISOLATE endif @@ -94,11 +115,29 @@ ifdef FF_USE_PAGE_ARRAY HOST_CFLAGS+= -DFF_USE_PAGE_ARRAY endif +HOST_CFLAGS+= -DINET +CFLAGS+= -DINET + ifdef FF_INET6 HOST_CFLAGS+= -DINET6 CFLAGS+= -DINET6 endif +ifdef FF_IPSEC +HOST_CFLAGS+= -DIPSEC +CFLAGS+= -DIPSEC +endif + +GCCVERGE10 = $(shell expr `gcc -dumpversion | cut -f1 -d.` \>= 10) +ifeq "$(GCCVERGE10)" "1" + CFLAGS+= -Wno-error=stringop-overflow +endif + +GCCVERGE11 = $(shell expr `gcc -dumpversion | cut -f1 -d.` \>= 11) +ifeq "$(GCCVERGE11)" "1" + CFLAGS+= -Wno-error=stringop-overread +endif + HOST_C= ${CC} -c $(HOST_CFLAGS) ${HOST_INCLUDES} ${WERROR} ${PROF} $< @@ -223,7 +262,7 @@ FF_HOST_SRCS+= \ ff_dpdk_if.c \ ff_dpdk_pcap.c \ 
ff_epoll.c \ - ff_init.c + ff_init.c ifdef FF_KNI FF_HOST_SRCS+= \ @@ -331,7 +370,7 @@ LIBKERN_SRCS+= \ jenkins_hash.c \ strlcpy.c \ strnlen.c \ - zlib.c + zlib.c endif @@ -495,11 +534,15 @@ NETINET6_SRCS+= \ #ip6_ipsec.c #sctp6_usrreq.c #in6_rss.c + +ifneq ($(TGT_OS),FreeBSD) ifndef FF_KNI FF_HOST_SRCS+= \ ff_dpdk_kni.c -endif -endif +endif #FF_KNI +endif #FreeBSD OS Check + +endif #INET6 ifdef FF_IPFW NETIPFW_SRCS+= \ @@ -600,7 +643,7 @@ all: libfstack.a libfstack.a: machine_includes ff_api.symlist ${MHEADERS} ${MSRCS} ${HOST_OBJS} ${ASM_OBJS} ${OBJS} ${LD} -d -r -o $*.ro ${ASM_OBJS} ${OBJS} nm $*.ro | grep -v ' U ' | cut -d ' ' -f 3 > $*_localize_list.tmp - objcopy --localize-symbols=$*_localize_list.tmp $*.ro + objcopy --localize-symbols=$*_localize_list.tmp $*.ro rm $*_localize_list.tmp objcopy --globalize-symbols=ff_api.symlist $*.ro rm -f $@ diff --git a/lib/ff_api.h b/lib/ff_api.h index 0470c8313..3cb57de86 100644 --- a/lib/ff_api.h +++ b/lib/ff_api.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 THL A29 Limited, a Tencent company. + * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -41,7 +41,7 @@ extern "C" { struct linux_sockaddr { short sa_family; - char sa_data[126]; + char sa_data[14]; }; #define AF_INET6_LINUX 10 @@ -64,6 +64,15 @@ int ff_sysctl(const int *name, u_int namelen, void *oldp, size_t *oldlenp, int ff_ioctl(int fd, unsigned long request, ...); +/* + * After getting a sockfd from this API, set it to non-blocking mode like this; + * otherwise some socket interfaces, such as `ff_write()`, may not work properly: + * + * int on = 1; + * ff_ioctl(sockfd, FIONBIO, &on); + * + * See also `example/main.c` + */ int ff_socket(int domain, int type, int protocol); int ff_setsockopt(int s, int level, int optname, const void *optval, @@ -87,6 +96,21 @@ int ff_getsockname(int s, struct linux_sockaddr *name, ssize_t ff_read(int d, void *buf, size_t nbytes); ssize_t ff_readv(int fd, const struct iovec *iov, int iovcnt); + +/* + * Write data to the socket's send buffer (sendspace). + * + * Note: + * In an F-Stack application, `fd` must be set to non-blocking mode in advance. + * Otherwise, if the `nbytes` parameter is greater than + * `net.inet.tcp.sendspace + net.inet.tcp.sendbuf_inc`, + * the API will return -1 rather than the length that has been sent. + * + * You can also modify the values of `net.inet.tcp.sendspace` (default 16384 bytes) + * and `net.inet.tcp.sendbuf_inc` (default 16384 bytes) in `config.ini`. + * Note that not every value takes effect, for example 32768 and 32768. + * Use `ff_sysctl` to inspect these values while the application is running. 
+ */ ssize_t ff_write(int fd, const void *buf, size_t nbytes); ssize_t ff_writev(int fd, const struct iovec *iov, int iovcnt); @@ -106,10 +130,10 @@ int ff_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, int ff_poll(struct pollfd fds[], nfds_t nfds, int timeout); int ff_kqueue(void); -int ff_kevent(int kq, const struct kevent *changelist, int nchanges, +int ff_kevent(int kq, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); -int ff_kevent_do_each(int kq, const struct kevent *changelist, int nchanges, - void *eventlist, int nevents, const struct timespec *timeout, +int ff_kevent_do_each(int kq, const struct kevent *changelist, int nchanges, + void *eventlist, int nevents, const struct timespec *timeout, void (*do_each)(void **, struct kevent *)); int ff_gettimeofday(struct timeval *tv, struct timezone *tz); diff --git a/lib/ff_config.c b/lib/ff_config.c index 933fec9ae..19953aadc 100644 --- a/lib/ff_config.c +++ b/lib/ff_config.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 THL A29 Limited, a Tencent company. + * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -191,6 +191,7 @@ freebsd_conf_handler(struct ff_config *cfg, const char *section, } } else { fprintf(stderr, "freebsd conf section[%s] error\n", section); + free(newconf); return 0; } @@ -366,6 +367,81 @@ parse_port_slave_list(struct ff_port_cfg *cfg, const char *v_str) return res; } +static int +vip_cfg_handler(struct ff_port_cfg *cur) +{ + //vip cfg + int ret; + char *vip_addr_array[VIP_MAX_NUM]; + + ret = rte_strsplit(cur->vip_addr_str, strlen(cur->vip_addr_str), &vip_addr_array[0], VIP_MAX_NUM, ';'); + if (ret <= 0) { + fprintf(stdout, "vip_cfg_handler nb_vip is 0, not set vip_addr or set invalid vip_addr %s\n", + cur->vip_addr_str); + return 1; + } + + cur->nb_vip = ret; + + cur->vip_addr_array = (char **)calloc(cur->nb_vip, sizeof(char *)); + if (cur->vip_addr_array == NULL) { + fprintf(stderr, "vip_cfg_handler malloc failed\n"); + goto err; + } + + memcpy(cur->vip_addr_array, vip_addr_array, cur->nb_vip * sizeof(char *)); + + return 1; + +err: + cur->nb_vip = 0; + if (cur->vip_addr_array) { + free(cur->vip_addr_array); + cur->vip_addr_array = NULL; + } + + return 0; +} + +#ifdef INET6 +static int +vip6_cfg_handler(struct ff_port_cfg *cur) +{ + //vip6 cfg + int ret; + char *vip_addr6_array[VIP_MAX_NUM]; + + ret = rte_strsplit(cur->vip_addr6_str, strlen(cur->vip_addr6_str), + &vip_addr6_array[0], VIP_MAX_NUM, ';'); + if (ret == 0) { + fprintf(stdout, "vip6_cfg_handler nb_vip6 is 0, not set vip_addr6 or set invalid vip_addr6 %s\n", + cur->vip_addr6_str); + return 1; + } + + cur->nb_vip6 = ret; + + cur->vip_addr6_array = (char **) calloc(cur->nb_vip6, sizeof(char *)); + if (cur->vip_addr6_array == NULL) { + fprintf(stderr, "vip6_cfg_handler malloc failed\n"); + goto fail; + } + + memcpy(cur->vip_addr6_array, vip_addr6_array, cur->nb_vip6 * sizeof(char *)); + + return 1; + +fail: + cur->nb_vip6 = 0; + if (cur->vip_addr6_array) { + free(cur->vip_addr6_array); + cur->vip_addr6_array = NULL; + 
} + + return 0; +} +#endif + static int port_cfg_handler(struct ff_config *cfg, const char *section, const char *name, const char *value) { @@ -414,7 +490,9 @@ port_cfg_handler(struct ff_config *cfg, const char *section, cur->port_id = portid; } - if (strcmp(name, "addr") == 0) { + if (strcmp(name, "if_name") == 0) { + cur->ifname = strdup(value); + } else if (strcmp(name, "addr") == 0) { cur->addr = strdup(value); } else if (strcmp(name, "netmask") == 0) { cur->netmask = strdup(value); @@ -422,26 +500,33 @@ port_cfg_handler(struct ff_config *cfg, const char *section, cur->broadcast = strdup(value); } else if (strcmp(name, "gateway") == 0) { cur->gateway = strdup(value); - } else if (strcmp(name, "pcap") == 0) { - cur->pcap = strdup(value); } else if (strcmp(name, "lcore_list") == 0) { return parse_port_lcore_list(cur, value); } else if (strcmp(name, "slave_port_list") == 0) { return parse_port_slave_list(cur, value); + } else if (strcmp(name, "vip_addr") == 0) { + cur->vip_addr_str = strdup(value); + if (cur->vip_addr_str) { + return vip_cfg_handler(cur); + } + } else if (strcmp(name, "vip_ifname") == 0) { + cur->vip_ifname = strdup(value); } #ifdef INET6 - else if (0 == strcmp(name, "addr6")) - { + else if (0 == strcmp(name, "addr6")) { cur->addr6_str = strdup(value); - } - else if (0 == strcmp(name, "prefix_len")) - { + } else if (0 == strcmp(name, "prefix_len")) { cur->prefix_len = atoi(value); - } - else if (0 == strcmp(name, "gateway6")) - { + } else if (0 == strcmp(name, "gateway6")) { cur->gateway6_str = strdup(value); + } else if (strcmp(name, "vip_addr6") == 0) { + cur->vip_addr6_str = strdup(value); + if (cur->vip_addr6_str) { + return vip6_cfg_handler(cur); + } + } else if (0 == strcmp(name, "vip_prefix_len")) { + cur->vip_prefix_len = atoi(value); } #endif @@ -571,7 +656,9 @@ ini_parse_handler(void* user, const char* section, const char* name, printf("[%s]: %s=%s\n", section, name, value); #define MATCH(s, n) strcmp(section, s) == 0 && strcmp(name, n) 
== 0 - if (MATCH("dpdk", "channel")) { + if (MATCH("dpdk", "log_level")) { + pconfig->dpdk.log_level = atoi(value); + } else if (MATCH("dpdk", "channel")) { pconfig->dpdk.nb_channel = atoi(value); } else if (MATCH("dpdk", "memory")) { pconfig->dpdk.memory = atoi(value); @@ -640,9 +727,9 @@ ini_parse_handler(void* user, const char* section, const char* name, return bond_cfg_handler(pconfig, section, name, value); } else if (strcmp(section, "pcap") == 0) { if (strcmp(name, "snaplen") == 0) { - pconfig->pcap.snap_len = (uint16_t)atoi(value); + pconfig->pcap.snap_len = (uint16_t)atoi(value); } else if (strcmp(name, "savelen") == 0) { - pconfig->pcap.save_len = (uint32_t)atoi(value); + pconfig->pcap.save_len = (uint32_t)atoi(value); } else if (strcmp(name, "enable") == 0) { pconfig->pcap.enable = (uint16_t)atoi(value); } else if (strcmp(name, "savepath") == 0) { @@ -675,6 +762,10 @@ dpdk_args_setup(struct ff_config *cfg) sprintf(temp, "-m%d", cfg->dpdk.memory); dpdk_argv[n++] = strdup(temp); } + if (cfg->dpdk.log_level) { + sprintf(temp, "--log-level=%d", cfg->dpdk.log_level); + dpdk_argv[n++] = strdup(temp); + } if (cfg->dpdk.proc_type) { sprintf(temp, "--proc-type=%s", cfg->dpdk.proc_type); dpdk_argv[n++] = strdup(temp); @@ -688,8 +779,14 @@ dpdk_args_setup(struct ff_config *cfg) dpdk_argv[n++] = strdup(temp); } if (cfg->dpdk.pci_whitelist) { - sprintf(temp, "--pci-whitelist=%s", cfg->dpdk.pci_whitelist); - dpdk_argv[n++] = strdup(temp); + char* token; + char* rest = cfg->dpdk.pci_whitelist; + + while ((token = strtok_r(rest, ",", &rest))){ + sprintf(temp, "--pci-whitelist=%s", token); + dpdk_argv[n++] = strdup(temp); + } + } if (cfg->dpdk.nb_vdev) { @@ -785,6 +882,7 @@ dpdk_args_setup(struct ff_config *cfg) for (i=0; i struct ff_port_cfg*) struct ff_port_cfg *port_cfgs; struct ff_vdev_cfg *vdev_cfgs; diff --git a/lib/ff_dpdk_if.c b/lib/ff_dpdk_if.c index 1a93b5f2a..ea47eb8ed 100644 --- a/lib/ff_dpdk_if.c +++ b/lib/ff_dpdk_if.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 
2017 THL A29 Limited, a Tencent company. + * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -78,6 +78,7 @@ static int numa_on; static unsigned idle_sleep; static unsigned pkt_tx_delay; +static uint64_t usr_cb_tsc; static struct rte_timer freebsd_clock; @@ -357,7 +358,7 @@ init_mem_pool(void) } else { printf("create mbuf pool on socket %d\n", socketid); } - + #ifdef FF_USE_PAGE_ARRAY nb_mbuf = RTE_ALIGN_CEIL ( nb_ports*nb_lcores*MAX_PKT_BURST + @@ -704,6 +705,9 @@ init_port_start(void) port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO; pconf->hw_features.tx_tso = 1; } + else { + printf("TSO is not supported\n"); + } } else { printf("TSO is disabled\n"); } @@ -736,7 +740,7 @@ init_port_start(void) uint16_t q; for (q = 0; q < nb_queues; q++) { if (numa_on) { - uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q]; + uint16_t lcore_id = lcore_conf.port_cfgs[u_port_id].lcore_list[q]; socketid = rte_lcore_to_socket_id(lcore_id); } mbuf_pool = pktmbuf_pool[socketid]; @@ -792,8 +796,8 @@ init_port_start(void) if (ret < 0) { return ret; } - //RSS reta update will failed when enable flow isolate - #ifndef FF_FLOW_ISOLATE +//RSS reta update will failed when enable flow isolate +#ifndef FF_FLOW_ISOLATE if (nb_queues > 1) { /* set HW rss hash function to Toeplitz. */ if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) { @@ -810,7 +814,7 @@ init_port_start(void) set_rss_table(port_id, dev_info.reta_size, nb_queues); } - #endif +#endif /* Enable RX in promiscuous mode for the Ethernet device. */ if (ff_global_cfg.dpdk.promiscuous) { @@ -848,7 +852,7 @@ init_clock(void) return 0; } -#ifdef FF_FLOW_ISOLATE +#if defined(FF_FLOW_ISOLATE) || defined(FF_FDIR) /** Print a message out of a flow error. 
*/ static int port_flow_complain(struct rte_flow_error *error) @@ -875,7 +879,7 @@ port_flow_complain(struct rte_flow_error *error) const char *errstr; char buf[32]; int err = rte_errno; - + if ((unsigned int)error->type >= RTE_DIM(errstrlist) || !errstrlist[error->type]) errstr = "unknown type"; @@ -889,12 +893,15 @@ port_flow_complain(struct rte_flow_error *error) rte_strerror(err)); return -err; } +#endif + +#ifdef FF_FLOW_ISOLATE static int port_flow_isolate(uint16_t port_id, int set) { struct rte_flow_error error; - + /* Poisoning to make sure PMDs update it in case of error. */ memset(&error, 0x66, sizeof(error)); if (rte_flow_isolate(port_id, set, &error)) @@ -1055,6 +1062,110 @@ init_flow(uint16_t port_id, uint16_t tcp_port) { #endif +#ifdef FF_FDIR +/* + * Flow director allows the traffic to specific port to be processed on the + * specific queue. Unlike FF_FLOW_ISOLATE, the FF_FDIR implementation uses + * general flow rule so that most FDIR supported NIC will support. The best + * using case of FDIR is (but not limited to), using multiple processes to + * listen on different ports. + * + * This function can be called either in FSTACK or in end-application. + * + * Example: + * Given 2 fstack instances A and B. Instance A listens on port 80, and + * instance B listens on port 81. We want to process the traffic to port 80 + * on rx queue 0, and the traffic to port 81 on rx queue 1. + * // port 80 rx queue 0 + * ret = fdir_add_tcp_flow(port_id, 0, FF_FLOW_INGRESS, 0, 80); + * // port 81 rx queue 1 + * ret = fdir_add_tcp_flow(port_id, 1, FF_FLOW_INGRESS, 0, 81); + */ +#define FF_FLOW_EGRESS 1 +#define FF_FLOW_INGRESS 2 +/** + * Create a flow rule that moves packets with matching src and dest tcp port + * to the target queue. + * + * This function uses general flow rules and doesn't rely on the flow_isolation + * that not all the FDIR capable NIC support. + * + * @param port_id + * The selected port. + * @param queue + * The target queue. 
+ * @param dir + * The direction of the traffic. + * 1 for egress, 2 for ingress and sum(1+2) for both. + * @param tcp_sport + * The src tcp port to match. + * @param tcp_dport + * The dest tcp port to match. + * + */ +static int +fdir_add_tcp_flow(uint16_t port_id, uint16_t queue, uint16_t dir, + uint16_t tcp_sport, uint16_t tcp_dport) +{ + struct rte_flow_attr attr; + struct rte_flow_item flow_pattern[4]; + struct rte_flow_action flow_action[2]; + struct rte_flow *flow = NULL; + struct rte_flow_action_queue flow_action_queue = { .index = queue }; + struct rte_flow_item_tcp tcp_spec; + struct rte_flow_item_tcp tcp_mask; + struct rte_flow_error rfe; + int res; + + memset(flow_pattern, 0, sizeof(flow_pattern)); + memset(flow_action, 0, sizeof(flow_action)); + + /* + * set the rule attribute. + */ + memset(&attr, 0, sizeof(struct rte_flow_attr)); + attr.ingress = ((dir & FF_FLOW_INGRESS) > 0); + attr.egress = ((dir & FF_FLOW_EGRESS) > 0); + + /* + * create the action sequence. + * one action only, move packet to queue + */ + flow_action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE; + flow_action[0].conf = &flow_action_queue; + flow_action[1].type = RTE_FLOW_ACTION_TYPE_END; + + flow_pattern[0].type = RTE_FLOW_ITEM_TYPE_ETH; + flow_pattern[1].type = RTE_FLOW_ITEM_TYPE_IPV4; + + /* + * set the third level of the pattern (TCP). + */ + memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp)); + memset(&tcp_mask, 0, sizeof(struct rte_flow_item_tcp)); + tcp_spec.hdr.src_port = htons(tcp_sport); + tcp_mask.hdr.src_port = (tcp_sport == 0 ? 0: 0xffff); + tcp_spec.hdr.dst_port = htons(tcp_dport); + tcp_mask.hdr.dst_port = (tcp_dport == 0 ? 
0: 0xffff); + flow_pattern[2].type = RTE_FLOW_ITEM_TYPE_TCP; + flow_pattern[2].spec = &tcp_spec; + flow_pattern[2].mask = &tcp_mask; + + flow_pattern[3].type = RTE_FLOW_ITEM_TYPE_END; + + res = rte_flow_validate(port_id, &attr, flow_pattern, flow_action, &rfe); + if (res) + return (1); + + flow = rte_flow_create(port_id, &attr, flow_pattern, flow_action, &rfe); + if (!flow) + return port_flow_complain(&rfe); + + return (0); +} + +#endif + int ff_dpdk_init(int argc, char **argv) { @@ -1097,8 +1208,8 @@ ff_dpdk_init(int argc, char **argv) #ifdef FF_USE_PAGE_ARRAY ff_mmap_init(); #endif - -#ifdef FF_FLOW_ISOLATE + +#ifdef FF_FLOW_ISOLATE // run once in primary process if (0 == lcore_conf.tx_queue_id[0]){ ret = port_flow_isolate(0, 1); @@ -1106,7 +1217,7 @@ ff_dpdk_init(int argc, char **argv) rte_exit(EXIT_FAILURE, "init_port_isolate failed\n"); } #endif - + ret = init_port_start(); if (ret < 0) { rte_exit(EXIT_FAILURE, "init_port_start failed\n"); @@ -1114,8 +1225,8 @@ ff_dpdk_init(int argc, char **argv) init_clock(); #ifdef FF_FLOW_ISOLATE - //Only give a example usage: port_id=0, tcp_port= 80. - //Recommend: + //Only give a example usage: port_id=0, tcp_port= 80. + //Recommend: //1. init_flow should replace `set_rss_table` in `init_port_start` loop, This can set all NIC's port_id_list instead only 0 device(port_id). //2. using config options `tcp_port` replace magic number of 80 ret = init_flow(0, 80); @@ -1123,6 +1234,16 @@ ff_dpdk_init(int argc, char **argv) rte_exit(EXIT_FAILURE, "init_port_flow failed\n"); } #endif + +#ifdef FF_FDIR + /* + * Refer function header section for usage. 
+ */ + ret = fdir_add_tcp_flow(0, 0, FF_FLOW_INGRESS, 0, 80); + if (ret) + rte_exit(EXIT_FAILURE, "fdir_add_tcp_flow failed\n"); +#endif + return 0; } @@ -1192,7 +1313,8 @@ protocol_filter(const void *data, uint16_t len) if(ether_type == RTE_ETHER_TYPE_ARP) return FILTER_ARP; -#ifdef INET6 +#if (!defined(__FreeBSD__) && defined(INET6) ) || \ + ( defined(__FreeBSD__) && defined(INET6) && defined(FF_KNI)) if (ether_type == RTE_ETHER_TYPE_IPV6) { return ff_kni_proto_filter(data, len, ether_type); @@ -1313,12 +1435,14 @@ process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, uint16_t len = rte_pktmbuf_data_len(rtem); if (!pkts_from_ring) { - ff_traffic.rx_packets++; - ff_traffic.rx_bytes += len; + ff_traffic.rx_packets += rtem->nb_segs; + ff_traffic.rx_bytes += rte_pktmbuf_pkt_len(rtem); } if (!pkts_from_ring && packet_dispatcher) { + uint64_t cur_tsc = rte_rdtsc(); int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues); + usr_cb_tsc += rte_rdtsc() - cur_tsc; if (ret == FF_DISPATCH_RESPONSE) { rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len; /* @@ -1423,7 +1547,7 @@ process_dispatch_ring(uint16_t port_id, uint16_t queue_id, process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1); } - return 0; + return nb_rb; } static inline void @@ -1521,7 +1645,7 @@ handle_ipfw_msg(struct ff_msg *msg) case FF_IPFW_SET: ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, msg->ipfw.optname, msg->ipfw.optval, - *(msg->ipfw.optlen)); + *(msg->ipfw.optlen)); break; default: ret = -1; @@ -1660,11 +1784,11 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) if (unlikely(ff_global_cfg.pcap.enable)) { uint16_t i; for (i = 0; i < n; i++) { - ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i], + ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i], ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len); } } - + ret = rte_eth_tx_burst(port, queueid, m_table, n); ff_traffic.tx_packets += ret; uint16_t i; @@ -1674,7 
+1798,7 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) if (qconf->tx_mbufs[port].bsd_m_table[i]) ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs); #endif - } + } if (unlikely(ret < n)) { do { rte_pktmbuf_free(m_table[ret]); @@ -1716,7 +1840,7 @@ ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, #ifdef FF_USE_PAGE_ARRAY struct lcore_conf *qconf = &lcore_conf; int len = 0; - + len = ff_if_send_onepkt(ctx, m,total); if (unlikely(len == MAX_PKT_BURST)) { send_burst(qconf, MAX_PKT_BURST, ctx->port_id); @@ -1868,6 +1992,7 @@ main_loop(void *arg) idle = 1; sys_tsc = 0; usr_tsc = 0; + usr_cb_tsc = 0; /* * TX burst queue drain @@ -1904,7 +2029,7 @@ main_loop(void *arg) } #endif - process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); + idle &= !process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, MAX_PKT_BURST); @@ -1949,12 +2074,13 @@ main_loop(void *arg) end_tsc = idle_sleep_tsc; } + usr_tsc = usr_cb_tsc; if (usch_tsc == cur_tsc) { - usr_tsc = idle_sleep_tsc - div_tsc; + usr_tsc += idle_sleep_tsc - div_tsc; } if (!idle) { - sys_tsc = div_tsc - cur_tsc; + sys_tsc = div_tsc - cur_tsc - usr_cb_tsc; ff_top_status.sys_tsc += sys_tsc; } diff --git a/lib/ff_syscall_wrapper.c b/lib/ff_syscall_wrapper.c index 1bcfecacc..d3c4d4d59 100644 --- a/lib/ff_syscall_wrapper.c +++ b/lib/ff_syscall_wrapper.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2010 Kip Macy. All rights reserved. - * Copyright (C) 2017 THL A29 Limited, a Tencent company. + * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -87,6 +87,10 @@ #define LINUX_IP_TTL 2 #define LINUX_IP_HDRINCL 3 #define LINUX_IP_OPTIONS 4 +#define LINUX_IP_RECVTTL 12 +#define LINUX_IP_RECVTOS 13 +#define LINUX_IP_TRANSPARENT 19 +#define LINUX_IP_MINTTL 21 #define LINUX_IP_MULTICAST_IF 32 #define LINUX_IP_MULTICAST_TTL 33 @@ -94,6 +98,10 @@ #define LINUX_IP_ADD_MEMBERSHIP 35 #define LINUX_IP_DROP_MEMBERSHIP 36 +#define LINUX_IPV6_V6ONLY 26 +#define LINUX_IPV6_RECVPKTINFO 49 +#define LINUX_IPV6_TRANSPARENT 75 + #define LINUX_TCP_NODELAY 1 #define LINUX_TCP_MAXSEG 2 #define LINUX_TCP_KEEPIDLE 4 @@ -194,6 +202,44 @@ struct linux_msghdr { /* msghdr define end */ +/* cmsghdr define start */ + +struct linux_cmsghdr +{ + size_t cmsg_len; /* Length of data in cmsg_data plus length + of cmsghdr structure. + !! The type should be socklen_t but the + definition of the kernel is incompatible + with this. */ + int cmsg_level; /* Originating protocol. */ + int cmsg_type; /* Protocol specific type. */ +}; + +/* + * LINUX_CMSG_XXXX has the same effect as FreeBSD's CMSG_XXXX, + * because aligned to 8 bytes, but still redefine them. + */ +#define LINUX_CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ + _ALIGN(sizeof(struct linux_cmsghdr))) +#define LINUX_CMSG_SPACE(l) (_ALIGN(sizeof(struct linux_cmsghdr)) + _ALIGN(l)) +#define LINUX_CMSG_LEN(l) (_ALIGN(sizeof(struct linux_cmsghdr)) + (l)) + +#define LINUX_CMSG_FIRSTHDR(mhdr) \ + ((mhdr)->msg_controllen >= sizeof(struct linux_cmsghdr) ? \ + (struct linux_cmsghdr *)(mhdr)->msg_control : \ + (struct linux_cmsghdr *)0) + +#define LINUX_CMSG_NXTHDR(mhdr, cmsg) \ + ((char *)(cmsg) == (char *)0 ? LINUX_CMSG_FIRSTHDR(mhdr) : \ + ((char *)(cmsg) + _ALIGN(((struct linux_cmsghdr *)(cmsg))->cmsg_len) + \ + _ALIGN(sizeof(struct linux_cmsghdr)) > \ + (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? 
\ + (struct linux_cmsghdr *)0 : \ + (struct linux_cmsghdr *)(void *)((char *)(cmsg) + \ + _ALIGN(((struct linux_cmsghdr *)(cmsg))->cmsg_len))) + +/* cmsghdr define end */ + extern int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static long @@ -377,7 +423,30 @@ ip_opt_convert(int optname) case LINUX_IP_ADD_MEMBERSHIP: return IP_ADD_MEMBERSHIP; case LINUX_IP_DROP_MEMBERSHIP: - return IP_DROP_MEMBERSHIP; + return IP_DROP_MEMBERSHIP; + case LINUX_IP_RECVTTL: + return IP_RECVTTL; + case LINUX_IP_RECVTOS: + return IP_RECVTOS; + case LINUX_IP_TRANSPARENT: + return IP_BINDANY; + case LINUX_IP_MINTTL: + return IP_MINTTL; + default: + return optname; + } +} + +static int +ip6_opt_convert(int optname) +{ + switch(optname) { + case LINUX_IPV6_V6ONLY: + return IPV6_V6ONLY; + case LINUX_IPV6_RECVPKTINFO: + return IPV6_RECVPKTINFO; + case LINUX_IPV6_TRANSPARENT: + return IPV6_BINDANY; default: return optname; } @@ -414,6 +483,8 @@ linux2freebsd_opt(int level, int optname) return so_opt_convert(optname); case IPPROTO_IP: return ip_opt_convert(optname); + case IPPROTO_IPV6: + return ip6_opt_convert(optname); case IPPROTO_TCP: return tcp_opt_convert(optname); default: @@ -425,7 +496,7 @@ static void linux2freebsd_sockaddr(const struct linux_sockaddr *linux, socklen_t addrlen, struct sockaddr *freebsd) { - if (linux == NULL) { + if (linux == NULL || freebsd == NULL) { return; } @@ -433,20 +504,201 @@ linux2freebsd_sockaddr(const struct linux_sockaddr *linux, freebsd->sa_family = linux->sa_family == LINUX_AF_INET6 ? 
AF_INET6 : linux->sa_family; freebsd->sa_len = addrlen; - bcopy(linux->sa_data, freebsd->sa_data, addrlen - sizeof(linux->sa_family)); + if (linux->sa_data != freebsd->sa_data) { + bcopy(linux->sa_data, freebsd->sa_data, addrlen - sizeof(linux->sa_family)); + } } static void freebsd2linux_sockaddr(struct linux_sockaddr *linux, struct sockaddr *freebsd) { - if (linux == NULL) { + if (linux == NULL || freebsd == NULL) { return; } + /* #linux and #freebsd may point to the same address */ + if (linux->sa_data != freebsd->sa_data) { + bcopy(freebsd->sa_data, linux->sa_data, freebsd->sa_len - sizeof(linux->sa_family)); + } linux->sa_family = freebsd->sa_family == AF_INET6 ? LINUX_AF_INET6 : freebsd->sa_family; +} - bcopy(freebsd->sa_data, linux->sa_data, freebsd->sa_len - sizeof(linux->sa_family)); +static inline int +freebsd2linux_cmsghdr(struct linux_msghdr *linux_msg, const struct msghdr *freebsd_msg) +{ + struct cmsghdr *freebsd_cmsg = CMSG_FIRSTHDR(freebsd_msg); + struct linux_cmsghdr *linux_cmsg = LINUX_CMSG_FIRSTHDR(linux_msg); + + while (freebsd_cmsg && linux_cmsg) { + unsigned char *freebsd_optval = CMSG_DATA(freebsd_cmsg); + unsigned char *linux_optval = LINUX_CMSG_DATA(linux_cmsg); + + /* + * The addresses of linux_cmsg and freebsd_cmsg coincide during recvmsg, + * but the positions of the member fields differ, + * so the assignments must be done in reverse order. + * + * Although sizeof(struct linux_msghdr) and sizeof(struct msghdr) differ, + * cmsg_data skips the same 16 bytes in both; both are aligned to 8 bytes. + */ + linux_cmsg->cmsg_type = freebsd_cmsg->cmsg_type; + linux_cmsg->cmsg_level = freebsd_cmsg->cmsg_level; + linux_cmsg->cmsg_len = LINUX_CMSG_LEN(freebsd_cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))); + + /* + * The freebsd_msg's cmsg_level and cmsg_type have been modified during recvmsg, + * so linux_cmsg must be used to decide the type and calculate the data length. + * Do not copy the bytes that are only alignment padding. 
+ */ + switch (linux_cmsg->cmsg_level) { + case IPPROTO_IP: + switch (linux_cmsg->cmsg_type) { + case IP_RECVTOS: + linux_cmsg->cmsg_type = LINUX_IP_TOS; + *linux_optval = *freebsd_optval; + break; + case IP_RECVTTL: + linux_cmsg->cmsg_type = LINUX_IP_TTL; + *linux_optval = *freebsd_optval; + break; + /*case XXXX: + break;*/ + default: + memcpy(linux_optval, freebsd_optval, linux_cmsg->cmsg_len - sizeof(struct linux_cmsghdr)); + break; + } + + break; + default: + memcpy(linux_optval, freebsd_optval, linux_cmsg->cmsg_len - sizeof(struct linux_cmsghdr)); + break; + } + + linux_cmsg = LINUX_CMSG_NXTHDR(linux_msg, linux_cmsg); + freebsd_cmsg = CMSG_NXTHDR(freebsd_msg, freebsd_cmsg); + } + + return 0; +} + +static inline int +linux2freebsd_cmsg(const struct linux_msghdr *linux_msg, struct msghdr *freebsd_msg) +{ + struct cmsghdr *freebsd_cmsg = CMSG_FIRSTHDR(freebsd_msg); + struct linux_cmsghdr *linux_cmsg = LINUX_CMSG_FIRSTHDR(linux_msg); + + while (freebsd_cmsg && linux_cmsg) { + unsigned char *freebsd_optval = CMSG_DATA(freebsd_cmsg); + unsigned char *linux_optval = LINUX_CMSG_DATA(linux_cmsg); + + freebsd_cmsg->cmsg_type = linux_cmsg->cmsg_type; + freebsd_cmsg->cmsg_level = linux_cmsg->cmsg_level; + freebsd_cmsg->cmsg_len = CMSG_LEN(linux_cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct linux_cmsghdr))); + + switch (linux_cmsg->cmsg_level) { + case IPPROTO_IP: + switch (linux_cmsg->cmsg_type) { + case LINUX_IP_TOS: + freebsd_cmsg->cmsg_type = IP_TOS; + freebsd_cmsg->cmsg_len = CMSG_LEN(sizeof(char)); + + if (linux_cmsg->cmsg_len == LINUX_CMSG_LEN(sizeof(int))) { + *freebsd_optval = *(int *)linux_optval; + } else if (linux_cmsg->cmsg_len == LINUX_CMSG_LEN(sizeof(char))) { + *freebsd_optval = *linux_optval; + } + + break; + case LINUX_IP_TTL: + freebsd_cmsg->cmsg_type = IP_TTL; + freebsd_cmsg->cmsg_len = CMSG_LEN(sizeof(char)); + + *freebsd_optval = *(int *)linux_optval; + + break; + /*case XXXX: + break;*/ + default: + memcpy(freebsd_optval, linux_optval, 
linux_cmsg->cmsg_len - sizeof(struct linux_cmsghdr)); + break; + } + + break; + default: + memcpy(freebsd_optval, linux_optval, linux_cmsg->cmsg_len - sizeof(struct linux_cmsghdr)); + break; + } + + linux_cmsg = LINUX_CMSG_NXTHDR(linux_msg, linux_cmsg); + freebsd_cmsg = CMSG_NXTHDR(freebsd_msg, freebsd_cmsg); + } + + return 0; +} + +/* + * While sendmsg, need convert msg_name and msg_control from Linux to FreeBSD. + * While recvmsg, need convert msg_name and msg_control from FreeBSD to Linux. + */ +static int +freebsd2linux_msghdr(struct linux_msghdr *linux_msg, struct msghdr *freebsd_msg, int send_flag) +{ + if (linux_msg == NULL || freebsd_msg == NULL) { + return -1; + } + + if (linux_msg->msg_name && freebsd_msg->msg_name && !send_flag) { + linux_msg->msg_name = freebsd_msg->msg_name; + freebsd2linux_sockaddr(linux_msg->msg_name, freebsd_msg->msg_name); + linux_msg->msg_namelen = freebsd_msg->msg_namelen; + } + + linux_msg->msg_iov = freebsd_msg->msg_iov; + linux_msg->msg_iovlen = freebsd_msg->msg_iovlen; + + if(freebsd_msg->msg_control && linux_msg->msg_control && !send_flag) { + freebsd2linux_cmsghdr(linux_msg, freebsd_msg); + linux_msg->msg_controllen = freebsd_msg->msg_controllen; + } + + linux_msg->msg_flags = freebsd_msg->msg_flags; + + return 0; +} + +static int +linux2freebsd_msghdr(const struct linux_msghdr *linux_msg, struct msghdr *freebsd_msg, int send_flag) +{ + int ret = 0; + + if (linux_msg == NULL || freebsd_msg == NULL) { + return -1;; + } + + if (linux_msg->msg_name && freebsd_msg->msg_name && send_flag) { + linux2freebsd_sockaddr(linux_msg->msg_name, linux_msg->msg_namelen, freebsd_msg->msg_name); + } else { + freebsd_msg->msg_name = linux_msg->msg_name; + } + freebsd_msg->msg_namelen = linux_msg->msg_namelen; + + freebsd_msg->msg_iov = linux_msg->msg_iov; + freebsd_msg->msg_iovlen = linux_msg->msg_iovlen; + + freebsd_msg->msg_controllen = linux_msg->msg_controllen; + if (linux_msg->msg_control && send_flag) { + ret = 
linux2freebsd_cmsg(linux_msg, freebsd_msg); + if(ret < 0) { + return ret; + } + } else { + freebsd_msg->msg_control = linux_msg->msg_control; + } + + freebsd_msg->msg_flags = linux_msg->msg_flags; + + return 0; } int @@ -604,7 +856,7 @@ ff_close(int fd) { int rc; - if ((rc = kern_close(curthread, fd))) + if ((rc = kern_close(curthread, fd))) goto kern_fail; return (rc); @@ -619,7 +871,7 @@ ff_read(int fd, void *buf, size_t nbytes) struct uio auio; struct iovec aiov; int rc; - + if (nbytes > INT_MAX) { rc = EINVAL; goto kern_fail; @@ -686,7 +938,7 @@ ff_write(int fd, const void *buf, size_t nbytes) if ((rc = kern_writev(curthread, fd, &auio))) goto kern_fail; rc = curthread->td_retval[0]; - + return (rc); kern_fail: ff_os_errno(rc); @@ -709,7 +961,7 @@ ff_writev(int fd, const struct iovec *iov, int iovcnt) if ((rc = kern_writev(curthread, fd, &auio))) goto kern_fail; rc = curthread->td_retval[0]; - + return (rc); kern_fail: ff_os_errno(rc); @@ -760,25 +1012,39 @@ kern_fail: ssize_t ff_sendmsg(int s, const struct msghdr *msg, int flags) { - int rc; + int rc, ret; struct sockaddr_storage freebsd_sa; - void *linux_sa = msg->msg_name; + struct msghdr freebsd_msg; + struct cmsghdr *freebsd_cmsg = NULL; - if (linux_sa != NULL) { - linux2freebsd_sockaddr(linux_sa, - sizeof(struct linux_sockaddr), (struct sockaddr *)&freebsd_sa); - __DECONST(struct msghdr *, msg)->msg_name = &freebsd_sa; + freebsd_msg.msg_name = &freebsd_sa; + if ((__DECONST(struct linux_msghdr *, msg))->msg_control) { + freebsd_cmsg = malloc((__DECONST(struct linux_msghdr *, msg))->msg_controllen, NULL, 0); + if (freebsd_cmsg == NULL) { + rc = ENOMEM; + goto kern_fail; + } + } + freebsd_msg.msg_control = freebsd_cmsg; + + ret = linux2freebsd_msghdr((const struct linux_msghdr *)msg, &freebsd_msg, 1); + if (ret < 0) { + rc = EINVAL; + goto kern_fail; } - rc = sendit(curthread, s, __DECONST(struct msghdr *, msg), flags); - - __DECONST(struct msghdr *, msg)->msg_name = linux_sa; - + rc = sendit(curthread, s, 
&freebsd_msg, flags); if (rc) goto kern_fail; rc = curthread->td_retval[0]; + freebsd2linux_msghdr(__DECONST(struct linux_msghdr *, msg), &freebsd_msg, 1); + + if (freebsd_cmsg) { + free(freebsd_cmsg, NULL); + } + return (rc); kern_fail: ff_os_errno(rc); @@ -819,7 +1085,7 @@ ff_recvfrom(int s, void *buf, size_t len, int flags, if (fromlen != NULL) *fromlen = msg.msg_namelen; - if (from) + if (from && msg.msg_namelen != 0) freebsd2linux_sockaddr(from, (struct sockaddr *)&bsdaddr); return (rc); @@ -828,27 +1094,25 @@ kern_fail: return (-1); } -/* - * It is considered here that the upper 4 bytes of - * msg->iovlen and msg->msg_controllen in linux_msghdr are 0. - */ ssize_t ff_recvmsg(int s, struct msghdr *msg, int flags) { - int rc; - struct linux_msghdr *linux_msg = (struct linux_msghdr *)msg; + int rc, ret; + struct msghdr freebsd_msg; - msg->msg_flags = flags; + ret = linux2freebsd_msghdr((struct linux_msghdr *)msg, &freebsd_msg, 0); + if (ret < 0) { + rc = EINVAL; + goto kern_fail; + } + freebsd_msg.msg_flags = flags; - if ((rc = kern_recvit(curthread, s, msg, UIO_SYSSPACE, NULL))) { - msg->msg_flags = 0; + if ((rc = kern_recvit(curthread, s, &freebsd_msg, UIO_SYSSPACE, NULL))) { goto kern_fail; } rc = curthread->td_retval[0]; - freebsd2linux_sockaddr(linux_msg->msg_name, msg->msg_name); - linux_msg->msg_flags = msg->msg_flags; - msg->msg_flags = 0; + freebsd2linux_msghdr((struct linux_msghdr *)msg, &freebsd_msg, 0); return (rc); kern_fail: @@ -866,7 +1130,7 @@ ff_fcntl(int fd, int cmd, ...) 
va_start(ap, cmd); argp = va_arg(ap, uintptr_t); - va_end(ap); + va_end(ap); if ((rc = kern_fcntl(curthread, fd, cmd, argp))) goto kern_fail; @@ -897,11 +1161,11 @@ ff_accept(int s, struct linux_sockaddr * addr, if (addrlen) *addrlen = pf->sa_len; - + if(pf != NULL) free(pf, M_SONAME); return (rc); - + kern_fail: if(pf != NULL) free(pf, M_SONAME); @@ -929,7 +1193,7 @@ kern_fail: int ff_bind(int s, const struct linux_sockaddr *addr, socklen_t addrlen) { - int rc; + int rc; struct sockaddr_storage bsdaddr; linux2freebsd_sockaddr(addr, addrlen, (struct sockaddr *)&bsdaddr); @@ -974,7 +1238,7 @@ ff_getpeername(int s, struct linux_sockaddr * name, if(pf != NULL) free(pf, M_SONAME); return (rc); - + kern_fail: if(pf != NULL) free(pf, M_SONAME); @@ -1006,7 +1270,7 @@ kern_fail: return (-1); } -int +int ff_shutdown(int s, int how) { int rc; @@ -1031,7 +1295,7 @@ ff_sysctl(const int *name, u_int namelen, void *oldp, size_t *oldlenp, int rc; size_t retval; - rc = userland_sysctl(curthread, __DECONST(int *, name), namelen, oldp, oldlenp, + rc = userland_sysctl(curthread, __DECONST(int *, name), namelen, oldp, oldlenp, 1, __DECONST(void *, newp), newlen, &retval, 0); if (rc) goto kern_fail; @@ -1143,8 +1407,8 @@ kevent_copyin(void *arg, struct kevent *kevp, int count) } int -ff_kevent_do_each(int kq, const struct kevent *changelist, int nchanges, - void *eventlist, int nevents, const struct timespec *timeout, +ff_kevent_do_each(int kq, const struct kevent *changelist, int nchanges, + void *eventlist, int nevents, const struct timespec *timeout, void (*do_each)(void **, struct kevent *)) { int rc; @@ -1168,7 +1432,7 @@ ff_kevent_do_each(int kq, const struct kevent *changelist, int nchanges, kevent_copyin }; - if ((rc = kern_kevent(curthread, kq, nchanges, nevents, &k_ops, + if ((rc = kern_kevent(curthread, kq, nchanges, nevents, &k_ops, &ts))) goto kern_fail; @@ -1180,7 +1444,7 @@ kern_fail: } int -ff_kevent(int kq, const struct kevent *changelist, int nchanges, +ff_kevent(int 
kq, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout) { return ff_kevent_do_each(kq, changelist, nchanges, eventlist, nevents, timeout, NULL); diff --git a/lib/ff_veth.c b/lib/ff_veth.c index 57276ae46..e0270897a 100644 --- a/lib/ff_veth.c +++ b/lib/ff_veth.c @@ -69,10 +69,17 @@ struct ff_veth_softc { in_addr_t broadcast; in_addr_t gateway; + uint8_t nb_vip; + in_addr_t vip[VIP_MAX_NUM]; + #ifdef INET6 struct in6_addr ip6; struct in6_addr gateway6; uint8_t prefix_length; + + uint8_t nb_vip6; + uint8_t vip_prefix_length; + struct in6_addr vip6[VIP_MAX_NUM]; #endif /* INET6 */ struct ff_dpdk_if_context *host_ctx; @@ -81,12 +88,26 @@ struct ff_veth_softc { static int ff_veth_config(struct ff_veth_softc *sc, struct ff_port_cfg *cfg) { + int i, j; + memcpy(sc->mac, cfg->mac, ETHER_ADDR_LEN); inet_pton(AF_INET, cfg->addr, &sc->ip); inet_pton(AF_INET, cfg->netmask, &sc->netmask); inet_pton(AF_INET, cfg->broadcast, &sc->broadcast); inet_pton(AF_INET, cfg->gateway, &sc->gateway); + if (cfg->nb_vip) { + for (i = 0, j = 0; i < cfg->nb_vip; ++i) { + if (inet_pton(AF_INET, cfg->vip_addr_array[i], &sc->vip[j])) { + j++; + } else { + printf("ff_veth_config inet_pton vip %s failed.\n", cfg->vip_addr_array[i]); + } + } + + sc->nb_vip = j; + } + #ifdef INET6 if (cfg->addr6_str) { inet_pton(AF_INET6_LINUX, cfg->addr6_str, &sc->ip6); @@ -103,6 +124,19 @@ ff_veth_config(struct ff_veth_softc *sc, struct ff_port_cfg *cfg) } else { printf("%s: No addr6 config found.\n", sc->host_ifname); } + + if (cfg->nb_vip6) { + for (i = 0, j = 0; i < cfg->nb_vip6; ++i) { + if (inet_pton(AF_INET6_LINUX, cfg->vip_addr6_array[i], &sc->vip6[j])) { + j++; + } else { + printf("ff_veth_config inet_pton vip6 %s failed.\n", cfg->vip_addr6_array[i]); + } + } + + sc->nb_vip6 = j; + sc->vip_prefix_length = cfg->vip_prefix_len == 0 ? 
64 : cfg->vip_prefix_len; + } #endif /* INET6 */ return 0; @@ -299,7 +333,7 @@ ff_mbuf_get(void *p, void *m, void *data, uint16_t len) struct mbuf *mb = m_get(M_NOWAIT, MT_DATA); if (mb == NULL) { - return NULL; + return NULL; } m_extadd(mb, data, len, ff_mbuf_ext_free, m, NULL, 0, EXT_DISPOSABLE); @@ -393,6 +427,52 @@ ff_veth_set_gateway(struct ff_veth_softc *sc) (struct sockaddr *)&nm, RTF_GATEWAY, NULL, RT_DEFAULT_FIB); } +static int +ff_veth_setvaddr(struct ff_veth_softc *sc, struct ff_port_cfg *cfg) +{ + struct in_aliasreq req; + bzero(&req, sizeof req); + + if (cfg->vip_ifname) { + strlcpy(req.ifra_name, cfg->vip_ifname, IFNAMSIZ); + } else { + strlcpy(req.ifra_name, sc->ifp->if_dname, IFNAMSIZ); + } + + struct sockaddr_in sa; + bzero(&sa, sizeof(sa)); + sa.sin_len = sizeof(sa); + sa.sin_family = AF_INET; + + int i, ret; + struct socket *so = NULL; + socreate(AF_INET, &so, SOCK_DGRAM, 0, curthread->td_ucred, curthread); + + for (i = 0; i < sc->nb_vip; ++i) { + sa.sin_addr.s_addr = sc->vip[i]; + bcopy(&sa, &req.ifra_addr, sizeof(sa)); + + // Only support '255.255.255.255' netmask now + sa.sin_addr.s_addr = 0xFFFFFFFF; + bcopy(&sa, &req.ifra_mask, sizeof(sa)); + + // Only support 'x.x.x.255' broadaddr now + sa.sin_addr.s_addr = sc->vip[i] | 0xFF000000; + bcopy(&sa, &req.ifra_broadaddr, sizeof(sa)); + + ret = ifioctl(so, SIOCAIFADDR, (caddr_t)&req, curthread); + if (ret < 0) { + printf("ff_veth_setvaddr ifioctl SIOCAIFADDR error\n"); + goto done; + } + } + +done: + sofree(so); + + return ret; +} + #ifdef INET6 static int ff_veth_setaddr6(struct ff_veth_softc *sc) @@ -410,7 +490,8 @@ ff_veth_setaddr6(struct ff_veth_softc *sc) uint8_t mask_size_mod = sc->prefix_length % 8; if (mask_size_mod) { - ifr6.ifra_prefixmask.sin6_addr.__u6_addr.__u6_addr8[sc->prefix_length / 8] = ((1 << mask_size_mod) - 1) << (8 - mask_size_mod); + ifr6.ifra_prefixmask.sin6_addr.__u6_addr.__u6_addr8[sc->prefix_length / 8] = \ + ((1 << mask_size_mod) - 1) << (8 - mask_size_mod); } 
ifr6.ifra_lifetime.ia6t_pltime = ifr6.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; @@ -442,6 +523,52 @@ ff_veth_set_gateway6(struct ff_veth_softc *sc) return rtrequest_fib(RTM_ADD, (struct sockaddr *)&dst, (struct sockaddr *)&gw, (struct sockaddr *)&nm, RTF_GATEWAY, NULL, RT_DEFAULT_FIB); } + +static int +ff_veth_setvaddr6(struct ff_veth_softc *sc, struct ff_port_cfg *cfg) +{ + struct in6_aliasreq ifr6; + bzero(&ifr6, sizeof(ifr6)); + + if (cfg->vip_ifname) { + strlcpy(ifr6.ifra_name, cfg->vip_ifname, IFNAMSIZ); + } else { + strlcpy(ifr6.ifra_name, sc->ifp->if_dname, IFNAMSIZ); + } + + ifr6.ifra_addr.sin6_len = sizeof ifr6.ifra_addr; + ifr6.ifra_addr.sin6_family = AF_INET6; + + ifr6.ifra_prefixmask.sin6_len = sizeof ifr6.ifra_prefixmask; + memset(&ifr6.ifra_prefixmask.sin6_addr, 0xff, sc->prefix_length / 8); + uint8_t mask_size_mod = sc->prefix_length % 8; + if (mask_size_mod) + { + ifr6.ifra_prefixmask.sin6_addr.__u6_addr.__u6_addr8[sc->prefix_length / 8] = \ + ((1 << mask_size_mod) - 1) << (8 - mask_size_mod); + } + + ifr6.ifra_lifetime.ia6t_pltime = ifr6.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; + + struct socket *so = NULL; + socreate(AF_INET6, &so, SOCK_DGRAM, 0, curthread->td_ucred, curthread); + + int i, ret; + for (i = 0; i < sc->nb_vip6; ++i) { + ifr6.ifra_addr.sin6_addr = sc->vip6[i]; + + ret = ifioctl(so, SIOCAIFADDR_IN6, (caddr_t)&ifr6, curthread); + if (ret < 0) { + printf("ff_veth_setvaddr6 ifioctl SIOCAIFADDR error\n"); + goto done; + } + } + +done: + sofree(so); + + return ret; +} #endif /* INET6 */ static int @@ -485,7 +612,7 @@ ff_veth_setup_interface(struct ff_veth_softc *sc, struct ff_port_cfg *cfg) return -1; } - //set ip + // Set IP int ret = ff_veth_setaddr(sc); if (ret != 0) { printf("ff_veth_setaddr failed\n"); @@ -495,6 +622,10 @@ ff_veth_setup_interface(struct ff_veth_softc *sc, struct ff_port_cfg *cfg) printf("ff_veth_set_gateway failed\n"); } + if (sc->nb_vip) { + ret = ff_veth_setvaddr(sc, cfg); + } + #ifdef INET6 // Set 
IPv6 if (cfg->addr6_str) { @@ -510,6 +641,10 @@ ff_veth_setup_interface(struct ff_veth_softc *sc, struct ff_port_cfg *cfg) } } } + + if (sc->nb_vip6) { + ret = ff_veth_setvaddr6(sc, cfg); + } #endif /* INET6 */ return (0); @@ -528,7 +663,11 @@ ff_veth_attach(struct ff_port_cfg *cfg) } memset(sc, 0, sizeof(struct ff_veth_softc)); - snprintf(sc->host_ifname, sizeof(sc->host_ifname), ff_IF_NAME, cfg->port_id); + if(cfg->ifname){ + snprintf(sc->host_ifname, sizeof(sc->host_ifname), "%s", cfg->ifname); + } else { + snprintf(sc->host_ifname, sizeof(sc->host_ifname), ff_IF_NAME, cfg->port_id); + } error = ff_veth_config(sc, cfg); if (0 != error) { @@ -573,7 +712,7 @@ ff_veth_softc_to_hostc(void *softc) /******************** * get next mbuf's addr, current mbuf's data and datalen. -* +* ********************/ int ff_next_mbuf(void **mbuf_bsd, void **data, unsigned *len) { @@ -605,7 +744,7 @@ void* ff_rte_frm_extcl(void* mbuf) bsd_mbuf->m_ext.ext_type == EXT_DISPOSABLE && bsd_mbuf->m_ext.ext_free == ff_mbuf_ext_free ) { return bsd_mbuf->m_ext.ext_arg1; } - else + else return NULL; }