/* * Copyright (C) 2017 THL A29 Limited, a Tencent company. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ff_dpdk_if.h" #include "ff_dpdk_pcap.h" #include "ff_dpdk_kni.h" #include "ff_config.h" #include "ff_veth.h" #include "ff_host_interface.h" #define MEMPOOL_CACHE_SIZE 256 #define ARP_RING_SIZE 2048 /* * Configurable number of RX/TX ring descriptors */ #define RX_QUEUE_SIZE 512 #define TX_QUEUE_SIZE 256 #define MAX_PKT_BURST 32 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ /* * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send. */ #define MAX_TX_BURST (MAX_PKT_BURST / 2) #define NB_SOCKETS 8 /* Configure how many packets ahead to prefetch, when reading packets */ #define PREFETCH_OFFSET 3 #define MAX_RX_QUEUE_PER_LCORE 16 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS #define MAX_RX_QUEUE_PER_PORT 128 #define BITS_PER_HEX 4 static int enable_kni; static int kni_accept; static struct rte_timer freebsd_clock; // Mellanox Linux's driver key static uint8_t default_rsskey_40bytes[40] = { 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc }; static struct rte_eth_conf default_port_conf = { .rxmode = { .mq_mode = ETH_MQ_RX_RSS, .max_rx_pkt_len = ETHER_MAX_LEN, .split_hdr_size = 0, /**< hdr buf size */ .header_split = 0, /**< Header Split disabled */ .hw_ip_checksum = 0, /**< IP checksum offload disabled */ .hw_vlan_filter = 0, /**< VLAN filtering disabled */ .hw_vlan_strip = 0, /**< VLAN strip disabled. */ .hw_vlan_extend = 0, /**< Extended VLAN disabled. */ .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ .hw_strip_crc = 0, /**< CRC stripped by hardware */ .enable_lro = 0, /**< LRO disabled */ }, .rx_adv_conf = { .rss_conf = { .rss_key = default_rsskey_40bytes, .rss_key_len = 40, .rss_hf = ETH_RSS_PROTO_MASK, }, }, .txmode = { .mq_mode = ETH_MQ_TX_NONE, }, }; struct mbuf_table { uint16_t len; struct rte_mbuf *m_table[MAX_PKT_BURST]; }; struct lcore_rx_queue { uint8_t port_id; uint8_t queue_id; } __rte_cache_aligned; struct lcore_conf { uint16_t proc_id; uint16_t nb_procs; uint16_t socket_id; uint16_t nb_rx_queue; uint16_t *lcore_proc; struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS]; char *pcap[RTE_MAX_ETHPORTS]; } __rte_cache_aligned; static struct lcore_conf lcore_conf; static struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; static struct rte_ring **arp_ring[RTE_MAX_LCORE]; struct ff_dpdk_if_context { void *sc; void *ifp; uint16_t port_id; }; static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; extern void ff_hardclock(void); static void freebsd_hardclock_job(__rte_unused struct rte_timer *timer, __rte_unused void *arg) { ff_hardclock(); } struct ff_dpdk_if_context * ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) { struct ff_dpdk_if_context *ctx; ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); if (ctx == NULL) return NULL; ctx->sc = sc; ctx->ifp = ifp; ctx->port_id = cfg->port_id; return ctx; } void ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) { free(ctx); } static void check_all_ports_link_status(void) { #define CHECK_INTERVAL 100 /* 100ms */ #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ uint8_t portid, count, all_ports_up, print_flag = 0; struct rte_eth_link link; printf("\nChecking link status"); fflush(stdout); int i, nb_ports; nb_ports = ff_global_cfg.dpdk.nb_ports; for (count = 0; count <= MAX_CHECK_TIME; count++) { all_ports_up = 1; for (i = 0; i < nb_ports; i++) { uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id; memset(&link, 0, sizeof(link)); rte_eth_link_get_nowait(portid, &link); /* print link status if flag set */ if (print_flag == 1) { if (link.link_status) { printf("Port %d Link Up - speed %u " "Mbps - %s\n", (int)portid, (unsigned)link.link_speed, (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex\n")); } else { printf("Port %d Link Down\n", (int)portid); } continue; } /* clear all_ports_up flag if any link down */ if (link.link_status == 0) { all_ports_up = 0; break; } } /* after finally printing all link status, get out */ if (print_flag == 1) break; if (all_ports_up == 0) { printf("."); fflush(stdout); rte_delay_ms(CHECK_INTERVAL); } /* set the print_flag if all ports up or timeout */ if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { print_flag = 1; printf("done\n"); } } } static int xdigit2val(unsigned char c) { int val; if (isdigit(c)) val = c - '0'; else if (isupper(c)) val = c - 'A' + 10; else val = c - 'a' + 10; return val; } static int parse_lcore_mask(const char *coremask, uint16_t *lcore_proc, uint16_t nb_procs) { int i, j, idx = 0; unsigned count = 0; char c; int val; if (coremask == NULL) return -1; /* Remove all blank characters ahead and after. * Remove 0x/0X if exists. */ while (isblank(*coremask)) coremask++; if (coremask[0] == '0' && ((coremask[1] == 'x') || (coremask[1] == 'X'))) coremask += 2; i = strlen(coremask); while ((i > 0) && isblank(coremask[i - 1])) i--; if (i == 0) return -1; for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE && count < nb_procs; i--) { c = coremask[i]; if (isxdigit(c) == 0) { return -1; } val = xdigit2val(c); for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE && count < nb_procs; j++, idx++) { if ((1 << j) & val) { if (!lcore_config[idx].detected) { RTE_LOG(ERR, EAL, "lcore %u unavailable\n", idx); return -1; } lcore_proc[count] = idx; count++; } } } for (; i >= 0; i--) if (coremask[i] != '0') return -1; if (count < nb_procs) return -1; return 0; } static int init_lcore_conf(void) { uint8_t nb_ports = rte_eth_dev_count(); if (nb_ports == 0) { rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); } lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs; lcore_conf.lcore_proc = rte_zmalloc(NULL, sizeof(uint16_t)*lcore_conf.nb_procs, 0); if (lcore_conf.lcore_proc == NULL) { rte_exit(EXIT_FAILURE, "rte_zmalloc lcore_proc failed\n"); } int ret = parse_lcore_mask(ff_global_cfg.dpdk.lcore_mask, lcore_conf.lcore_proc, lcore_conf.nb_procs); if (ret < 0) { rte_exit(EXIT_FAILURE, "parse_lcore_mask failed:%s\n", ff_global_cfg.dpdk.lcore_mask); } uint16_t socket_id = 0; if (ff_global_cfg.dpdk.numa_on) { socket_id = rte_lcore_to_socket_id(rte_lcore_id()); } lcore_conf.socket_id = socket_id; /* Currently, proc id 1:1 map to rx/tx queue id per port. */ uint8_t port_id, enabled_ports = 0; for (port_id = 0; port_id < nb_ports; port_id++) { if (ff_global_cfg.dpdk.port_mask && (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) { printf("\nSkipping disabled port %d\n", port_id); continue; } if (port_id >= ff_global_cfg.dpdk.nb_ports) { printf("\nSkipping non-configured port %d\n", port_id); break; } uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id; lcore_conf.nb_rx_queue++; lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id; lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap; ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id; enabled_ports++; } ff_global_cfg.dpdk.nb_ports = enabled_ports; return 0; } static int init_mem_pool(void) { uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; uint32_t nb_tx_queue = nb_lcores; uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; unsigned nb_mbuf = RTE_MAX ( (nb_rx_queue*RX_QUEUE_SIZE + nb_ports*nb_lcores*MAX_PKT_BURST + nb_ports*nb_tx_queue*TX_QUEUE_SIZE + nb_lcores*MEMPOOL_CACHE_SIZE), (unsigned)8192); unsigned socketid = 0; uint16_t i, lcore_id; char s[64]; int numa_on = ff_global_cfg.dpdk.numa_on; for (i = 0; i < lcore_conf.nb_procs; i++) { lcore_id = lcore_conf.lcore_proc[i]; if (numa_on) { socketid = rte_lcore_to_socket_id(lcore_id); } if (socketid >= NB_SOCKETS) { rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", socketid, i, NB_SOCKETS); } if (pktmbuf_pool[socketid] != NULL) { continue; } if (rte_eal_process_type() == RTE_PROC_PRIMARY) { snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); pktmbuf_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf, MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE, socketid); } else { snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); pktmbuf_pool[socketid] = rte_mempool_lookup(s); } if (pktmbuf_pool[socketid] == NULL) { rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid); } else { printf("create mbuf pool on socket %d\n", socketid); } } return 0; } static int init_arp_ring(void) { int i, ret; char name_buf[RTE_RING_NAMESIZE]; int nb_procs = ff_global_cfg.dpdk.nb_procs; int proc_id = ff_global_cfg.dpdk.proc_id; /* Allocate arp ring ptr according to eth dev count. */ int nb_ports = rte_eth_dev_count(); for(i = 0; i < nb_procs; ++i) { snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d", proc_id, i); arp_ring[i] = rte_zmalloc(name_buf, sizeof(struct rte_ring *) * nb_ports, RTE_CACHE_LINE_SIZE); if (arp_ring[i] == NULL) { rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " "failed\n", name_buf); } } unsigned socketid = lcore_conf.socket_id; /* Create ring according to ports actually being used. */ nb_ports = ff_global_cfg.dpdk.nb_ports; for (i = 0; i < nb_ports; i++) { uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id; for(i = 0; i < nb_procs; ++i) { snprintf(name_buf, RTE_RING_NAMESIZE, "ring_%d_%d", i, port_id); if (rte_eal_process_type() == RTE_PROC_PRIMARY) { arp_ring[i][port_id] = rte_ring_create(name_buf, ARP_RING_SIZE, socketid, RING_F_SC_DEQ); } else { arp_ring[i][port_id] = rte_ring_lookup(name_buf); } if (arp_ring[i][port_id] == NULL) rte_panic("create kni ring::%s failed!\n", name_buf); if (rte_ring_lookup(name_buf) != arp_ring[i][port_id]) rte_panic("lookup kni ring:%s failed!\n", name_buf); printf("create arp ring:%s success, %u ring entries are now free!\n", name_buf, rte_ring_free_count(arp_ring[i][port_id])); } } return 0; } static int init_kni(void) { int nb_ports = rte_eth_dev_count(); kni_accept = 0; if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) kni_accept = 1; ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, ff_global_cfg.kni.udp_port); unsigned socket_id = lcore_conf.socket_id; struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; nb_ports = ff_global_cfg.dpdk.nb_ports; int i, ret; for (i = 0; i < nb_ports; i++) { uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id; ff_kni_alloc(port_id, socket_id, mbuf_pool); } return 0; } static int init_port_start(void) { int nb_ports = ff_global_cfg.dpdk.nb_ports; uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id()); struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid]; uint16_t i; for (i = 0; i < nb_ports; i++) { uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id; struct rte_eth_dev_info dev_info; rte_eth_dev_info_get(port_id, &dev_info); if (nb_procs > dev_info.max_rx_queues) { rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n", nb_procs, dev_info.max_rx_queues); } if (nb_procs > dev_info.max_tx_queues) { rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n", nb_procs, dev_info.max_tx_queues); } struct ether_addr addr; rte_eth_macaddr_get(port_id, &addr); printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", (unsigned)port_id, addr.addr_bytes[0], addr.addr_bytes[1], addr.addr_bytes[2], addr.addr_bytes[3], addr.addr_bytes[4], addr.addr_bytes[5]); rte_memcpy(ff_global_cfg.dpdk.port_cfgs[port_id].mac, addr.addr_bytes, ETHER_ADDR_LEN); if (rte_eal_process_type() != RTE_PROC_PRIMARY) { return 0; } /* * TODO: * Set port conf according to dev's capability. */ struct rte_eth_conf port_conf = default_port_conf; port_conf.rxmode.hw_vlan_strip = ff_global_cfg.dpdk.port_cfgs[port_id].vlanstrip; /* Currently, proc id 1:1 map to queue id per port. */ int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf); if (ret != 0) { return ret; } uint16_t q; for (q = 0; q < nb_procs; q++) { ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE, socketid, &dev_info.default_txconf); if (ret < 0) { return ret; } ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE, socketid, &dev_info.default_rxconf, mbuf_pool); if (ret < 0) { return ret; } } ret = rte_eth_dev_start(port_id); if (ret < 0) { return ret; } /* Enable RX in promiscuous mode for the Ethernet device. */ if (ff_global_cfg.dpdk.promiscuous) { rte_eth_promiscuous_enable(port_id); ret = rte_eth_promiscuous_get(port_id); if (ret == 1) { printf("set port %u to promiscuous mode ok\n", port_id); } else { printf("set port %u to promiscuous mode error\n", port_id); } } /* Enable pcap dump */ if (ff_global_cfg.dpdk.port_cfgs[port_id].pcap) { ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[port_id].pcap); } } return 0; } static int init_freebsd_clock(void) { rte_timer_subsystem_init(); uint64_t hz = rte_get_timer_hz(); uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz; uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs; rte_timer_init(&freebsd_clock); rte_timer_reset(&freebsd_clock, tsc, PERIODICAL, rte_lcore_id(), &freebsd_hardclock_job, NULL); return 0; } int ff_dpdk_init(int argc, char **argv) { if (ff_global_cfg.dpdk.nb_procs < 1 || ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE || ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs || ff_global_cfg.dpdk.nb_procs < 0) { printf("param num_procs[%d] or proc_id[%d] error!\n", ff_global_cfg.dpdk.nb_procs, ff_global_cfg.dpdk.proc_id); exit(1); } int ret = rte_eal_init(argc, argv); if (ret < 0) { rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); } init_lcore_conf(); init_mem_pool(); init_arp_ring(); enable_kni = ff_global_cfg.kni.enable; if (enable_kni) { init_kni(); } ret = init_port_start(); if (ret < 0) { rte_exit(EXIT_FAILURE, "init_port_start failed\n"); } check_all_ports_link_status(); init_freebsd_clock(); return 0; } static void ff_veth_input(void *ifp, struct rte_mbuf *pkt) { void *data = rte_pktmbuf_mtod(pkt, void*); uint16_t len = rte_pktmbuf_data_len(pkt); void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len); if (hdr == NULL) { rte_pktmbuf_free(pkt); return; } pkt = pkt->next; void *prev = hdr; while(pkt != NULL) { data = rte_pktmbuf_mtod(pkt, void*); len = rte_pktmbuf_data_len(pkt); void *mb = ff_mbuf_get(prev, data, len); if (mb == NULL) { ff_mbuf_free(hdr); return; } pkt = pkt->next; prev = mb; } ff_veth_process_packet(ifp, hdr); } static enum FilterReturn protocol_filter(const void *data, uint16_t len) { if(len < sizeof(struct ether_hdr)) return FILTER_UNKNOWN; const struct ether_hdr *hdr; hdr = (const struct ether_hdr *)data; if(ntohs(hdr->ether_type) == ETHER_TYPE_ARP) return FILTER_ARP; if (!enable_kni) { return FILTER_UNKNOWN; } if(ntohs(hdr->ether_type) != ETHER_TYPE_IPv4) return FILTER_UNKNOWN; return ff_kni_proto_filter(data + sizeof(struct ether_hdr), len - sizeof(struct ether_hdr)); } static inline void process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, uint16_t count, void *ifp, int pkts_from_ring) { struct lcore_conf *qconf = &lcore_conf; uint16_t i; for (i = 0; i < count; i++) { struct rte_mbuf *rtem = bufs[i]; if (unlikely(qconf->pcap[port_id] != NULL)) { ff_dump_packets(qconf->pcap[port_id], rtem); } void *data = rte_pktmbuf_mtod(rtem, void*); uint16_t len = rte_pktmbuf_data_len(rtem); enum FilterReturn filter = protocol_filter(data, len); if (filter == FILTER_ARP) { struct rte_mempool *mbuf_pool; struct rte_mbuf *mbuf_clone; if (pkts_from_ring == 0) { uint16_t i; for(i = 0; i < qconf->nb_procs; ++i) { if(i == queue_id) continue; mbuf_pool = pktmbuf_pool[rte_lcore_to_socket_id(qconf->lcore_proc[i])]; mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool); if(mbuf_clone) { int ret = rte_ring_enqueue(arp_ring[i][port_id], mbuf_clone); if (ret < 0) rte_pktmbuf_free(mbuf_clone); } } } if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { mbuf_pool = pktmbuf_pool[qconf->socket_id]; mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool); if(mbuf_clone) { ff_kni_enqueue(port_id, rtem); } } ff_veth_input(ifp, rtem); } else if (enable_kni && ((filter == FILTER_KNI && kni_accept) || (filter == FILTER_UNKNOWN && !kni_accept)) ) { ff_kni_enqueue(port_id, rtem); } else { ff_veth_input(ifp, rtem); } } } static inline int process_arp_ring(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **pkts_burst, void *ifp) { /* read packet from ring buf and to process */ uint16_t nb_tx; nb_tx = rte_ring_dequeue_burst(arp_ring[queue_id][port_id], (void **)pkts_burst, MAX_PKT_BURST); if(nb_tx > 0) { process_packets(port_id, queue_id, pkts_burst, nb_tx, ifp, 1); } return 0; } /* Send burst of packets on an output interface */ static inline int send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) { struct rte_mbuf **m_table; int ret; uint16_t queueid; queueid = qconf->tx_queue_id[port]; m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; if (unlikely(qconf->pcap[port] != NULL)) { uint16_t i; for (i = 0; i < n; i++) { ff_dump_packets(qconf->pcap[port], m_table[i]); } } ret = rte_eth_tx_burst(port, queueid, m_table, n); if (unlikely(ret < n)) { do { rte_pktmbuf_free(m_table[ret]); } while (++ret < n); } return 0; } /* Enqueue a single packet, and send burst if queue is filled */ static inline int send_single_packet(struct rte_mbuf *m, uint8_t port) { uint16_t len; struct lcore_conf *qconf; qconf = &lcore_conf; len = qconf->tx_mbufs[port].len; qconf->tx_mbufs[port].m_table[len] = m; len++; /* enough pkts to be sent */ if (unlikely(len == MAX_PKT_BURST)) { send_burst(qconf, MAX_PKT_BURST, port); len = 0; } qconf->tx_mbufs[port].len = len; return 0; } int ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, int total) { struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); if (head == NULL) { ff_mbuf_free(m); return -1; } head->pkt_len = total; int off = 0; struct rte_mbuf *cur = head, *prev = NULL; while(total > 0) { if (cur == NULL) { struct rte_mbuf *cur = rte_pktmbuf_alloc(mbuf_pool); if (cur == NULL) { rte_pktmbuf_free(head); ff_mbuf_free(m); return -1; } } void *data = rte_pktmbuf_mtod(cur, void*); int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total; int ret = ff_mbuf_copydata(m, data, off, len); if (ret < 0) { rte_pktmbuf_free(head); ff_mbuf_free(m); return -1; } if (prev == NULL) { prev = cur; } else { prev->next = cur; } cur->data_len = len; off += len; total -= len; head->nb_segs++; } /* * FIXME: set offload flags according to mbuf.pkthdr; */ head->ol_flags = 0; head->vlan_tci = 0; ff_mbuf_free(m); return send_single_packet(head, ctx->port_id); } static int main_loop(void *arg) { struct loop_routine *lr = (struct loop_routine *)arg; struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; unsigned lcore_id; uint64_t prev_tsc, diff_tsc, cur_tsc; int i, j, nb_rx; uint8_t port_id, queue_id; struct lcore_conf *qconf; const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US; void *ifp; prev_tsc = 0; lcore_id = rte_lcore_id(); qconf = &lcore_conf; if (qconf->nb_rx_queue == 0) { printf("lcore %u has nothing to do\n", lcore_id); return 0; } while (1) { cur_tsc = rte_rdtsc(); if (unlikely(freebsd_clock.expire < cur_tsc)) { rte_timer_manage(); } /* * TX burst queue drain */ diff_tsc = cur_tsc - prev_tsc; if (unlikely(diff_tsc > drain_tsc)) { /* * This could be optimized (use queueid instead of * portid), but it is not called so often */ for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) { if (qconf->tx_mbufs[port_id].len == 0) continue; send_burst(qconf, qconf->tx_mbufs[port_id].len, port_id); qconf->tx_mbufs[port_id].len = 0; } prev_tsc = cur_tsc; } /* * Read packet from RX queues */ for (i = 0; i < qconf->nb_rx_queue; ++i) { port_id = qconf->rx_queue_list[i].port_id; queue_id = qconf->rx_queue_list[i].queue_id; ifp = veth_ctx[port_id]->ifp; if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST); } process_arp_ring(port_id, queue_id, pkts_burst, ifp); nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, MAX_PKT_BURST); if (nb_rx == 0) continue; /* Prefetch first packets */ for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { rte_prefetch0(rte_pktmbuf_mtod( pkts_burst[j], void *)); } /* Prefetch and handle already prefetched packets */ for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ j + PREFETCH_OFFSET], void *)); process_packets(port_id, queue_id, &pkts_burst[j], 1, ifp, 0); } /* Handle remaining prefetched packets */ for (; j < nb_rx; j++) { process_packets(port_id, queue_id, &pkts_burst[j], 1, ifp, 0); } } if (likely(lr->loop != NULL)) { lr->loop(lr->arg); } } } int ff_dpdk_if_up(void) { int nb_ports = ff_global_cfg.dpdk.nb_ports; int i; for (i = 0; i < nb_ports; i++) { uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id; veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i); if (veth_ctx[port_id] == NULL) { rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); } } return 0; } void ff_dpdk_run(loop_func_t loop, void *arg) { struct loop_routine *lr = malloc(sizeof(struct loop_routine)); lr->loop = loop; lr->arg = arg; rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); rte_eal_mp_wait_lcore(); free(lr); } void ff_dpdk_pktmbuf_free(void *m) { rte_pktmbuf_free((struct rte_mbuf *)m); } static uint32_t toeplitz_hash(unsigned keylen, const uint8_t *key, unsigned datalen, const uint8_t *data) { uint32_t hash = 0, v; u_int i, b; /* XXXRW: Perhaps an assertion about key length vs. data length? */ v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; for (i = 0; i < datalen; i++) { for (b = 0; b < 8; b++) { if (data[i] & (1<<(7-b))) hash ^= v; v <<= 1; if ((i + 4) < keylen && (key[i+4] & (1<<(7-b)))) v |= 1; } } return (hash); } int ff_rss_check(uint32_t saddr, uint32_t daddr, uint16_t sport, uint16_t dport) { struct lcore_conf *qconf = &lcore_conf; if (qconf->nb_procs == 1) { return 1; } uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + sizeof(dport)]; unsigned datalen = 0; bcopy(&saddr, &data[datalen], sizeof(saddr)); datalen += sizeof(saddr); bcopy(&daddr, &data[datalen], sizeof(daddr)); datalen += sizeof(daddr); bcopy(&sport, &data[datalen], sizeof(sport)); datalen += sizeof(sport); bcopy(&dport, &data[datalen], sizeof(dport)); datalen += sizeof(dport); uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes), default_rsskey_40bytes, datalen, data); return (hash % qconf->nb_procs) == qconf->proc_id; }