From 5bf882b459be555a487ad194a2767c4251200665 Mon Sep 17 00:00:00 2001 From: 10077240 Date: Mon, 1 Apr 2019 14:54:36 +0800 Subject: [PATCH] modify according to MR.Wang --- lib/Makefile | 14 +- lib/ff_config.c | 2 +- lib/ff_dpdk_if.c | 549 ++-------------------------------------- lib/ff_host_interface.c | 10 +- lib/ff_memory.c | 480 +++++++++++++++++++++++++++++++++++ lib/ff_memory.h | 119 +++++++++ lib/ff_veth.c | 5 +- 7 files changed, 642 insertions(+), 537 deletions(-) create mode 100644 lib/ff_memory.c create mode 100644 lib/ff_memory.h diff --git a/lib/Makefile b/lib/Makefile index 2873b87e2..5557bfba4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,6 +22,8 @@ HOST_OS:=$(shell uname -s) FF_KNI=1 #FF_NETGRAPH=1 #FF_IPFW=1 +#FF_USE_PAGE_ARRAY=1 + include ${TOPDIR}/mk/kern.pre.mk @@ -45,7 +47,6 @@ DPDK_CFLAGS= -Wall -Werror -include ${FF_DPDK}/include/rte_config.h DPDK_CFLAGS+= -march=native -DRTE_MACHINE_CPUFLAG_SSE -DRTE_MACHINE_CPUFLAG_SSE2 -DRTE_MACHINE_CPUFLAG_SSE3 DPDK_CFLAGS+= -DRTE_MACHINE_CPUFLAG_SSSE3 -DRTE_MACHINE_CPUFLAG_SSE4_1 -DRTE_MACHINE_CPUFLAG_SSE4_2 DPDK_CFLAGS+= -DRTE_COMPILE_TIME_CPUFLAGS=RTE_CPUFLAG_SSE,RTE_CPUFLAG_SSE2,RTE_CPUFLAG_SSE3,RTE_CPUFLAG_SSSE3,RTE_CPUFLAG_SSE4_1,RTE_CPUFLAG_SSE4_2 -#DPDK_CFLAGS+= -D_USE_PAGE_ARRAY_ DPDK_CFLAGS+= -I${FF_DPDK}/include KERNPREINCLUDES:= ${INCLUDES} @@ -76,6 +77,10 @@ ifdef FF_IPFW HOST_CFLAGS+= -DFF_IPFW endif +ifdef FF_USE_PAGE_ARRAY +HOST_CFLAGS+= -DFF_USE_PAGE_ARRAY +endif + HOST_C= ${CC} -c $(HOST_CFLAGS) ${HOST_INCLUDES} ${WERROR} ${PROF} $< @@ -194,13 +199,18 @@ FF_HOST_SRCS+= \ ff_dpdk_if.c \ ff_dpdk_pcap.c \ ff_epoll.c \ - ff_init.c + ff_init.c ifdef FF_KNI FF_HOST_SRCS+= \ ff_dpdk_kni.c endif +ifdef FF_USE_PAGE_ARRAY +FF_HOST_SRCS+= \ + ff_memory.c +endif + ifdef FF_IPSEC CRYPTO_ASM_SRCS+= \ aesencdec_${MACHINE_CPUARCH}.S \ diff --git a/lib/ff_config.c b/lib/ff_config.c index 6802ada25..8d8f21296 100644 --- a/lib/ff_config.c +++ b/lib/ff_config.c @@ -715,7 +715,7 @@ 
ff_default_config(struct ff_config *cfg) cfg->freebsd.hz = 100; cfg->freebsd.physmem = 1048576*256; cfg->freebsd.fd_reserve = 0; - cfg->freebsd.mem_size =256; + cfg->freebsd.mem_size = 256; } int diff --git a/lib/ff_dpdk_if.c b/lib/ff_dpdk_if.c index 7646ed7b2..f433b3662 100644 --- a/lib/ff_dpdk_if.c +++ b/lib/ff_dpdk_if.c @@ -61,41 +61,7 @@ #include "ff_host_interface.h" #include "ff_msg.h" #include "ff_api.h" - -#define MEMPOOL_CACHE_SIZE 256 - -#define DISPATCH_RING_SIZE 2048 - -#define MSG_RING_SIZE 32 - -#define PAGE_SIZE 4096 -#define PAGE_SHIFT 12 -#define PAGE_MASK (PAGE_SIZE - 1) -#define trunc_page(x) ((x) & ~PAGE_MASK) -#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK) - -/* - * Configurable number of RX/TX ring descriptors - */ -#define RX_QUEUE_SIZE 512 -#define TX_QUEUE_SIZE 512 - -#define MAX_PKT_BURST 32 -#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ - -/* - * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send. - */ -#define MAX_TX_BURST (MAX_PKT_BURST / 2) - -#define NB_SOCKETS 8 - -/* Configure how many packets ahead to prefetch, when reading packets */ -#define PREFETCH_OFFSET 3 - -#define MAX_RX_QUEUE_PER_LCORE 16 -#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS -#define MAX_RX_QUEUE_PER_PORT 128 +#include "ff_memory.h" #ifdef FF_KNI #define KNI_MBUF_MAX 2048 @@ -146,38 +112,9 @@ static struct rte_eth_conf default_port_conf = { }, }; -struct mbuf_table { - uint16_t len; - struct rte_mbuf *m_table[MAX_PKT_BURST]; -#ifdef _USE_PAGE_ARRAY_ - void* bsd_m_table[MAX_PKT_BURST]; // save bsd mbuf address which will be freed. 
-#endif +struct lcore_conf lcore_conf; -}; - -struct lcore_rx_queue { - uint16_t port_id; - uint16_t queue_id; -} __rte_cache_aligned; - -struct lcore_conf { - uint16_t proc_id; - uint16_t socket_id; - uint16_t nb_queue_list[RTE_MAX_ETHPORTS]; - struct ff_port_cfg *port_cfgs; - - uint16_t nb_rx_queue; - struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; - uint16_t nb_tx_port; - uint16_t tx_port_id[RTE_MAX_ETHPORTS]; - uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; - struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS]; - char *pcap[RTE_MAX_ETHPORTS]; -} __rte_cache_aligned; - -static struct lcore_conf lcore_conf; - -static struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; +struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; static dispatch_func_t packet_dispatcher; @@ -193,63 +130,10 @@ struct ff_msg_ring { static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; static struct rte_mempool *message_pool; - -struct ff_dpdk_if_context { - void *sc; - void *ifp; - uint16_t port_id; - struct ff_hw_features hw_features; -} __rte_cache_aligned; - static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; static struct ff_top_args ff_top_status; static struct ff_traffic_args ff_traffic; - -#ifdef _USE_PAGE_ARRAY_ - -// ff_ref_pool allocate rte_mbuf without data space, which data point to bsd mbuf's data address. -static struct rte_mempool *ff_ref_pool[NB_SOCKETS]; - -// mbuf_txring save mbuf which had bursted into NIC, m_tables has same length with NIC dev's sw_ring. -// Then when txring.m_table[x] is reused, the packet in txring.m_table[x] had been transmited by NIC. -// that means the mbuf can be freed safely. -struct mbuf_txring{ - void* m_table[TX_QUEUE_SIZE]; - uint16_t head; // next available element. 
-}; -#define Head_INC(h) {\ - if ( ++h >= TX_QUEUE_SIZE ) \ - h = 0;\ - }; - -#define Head_DEC(h) do{\ - if ( --h < 0 ) \ - h = TX_QUEUE_SIZE-1;\ - }while(0); - -// bsd mbuf was moved into nic_tx_ring from tmp_tables, after rte_eth_tx_burst() succeed. -static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS]; -static inline int ff_txring_enqueue(struct mbuf_txring* q, void *p, int seg_num); -static inline void ff_txring_init(struct mbuf_txring* r, uint32_t len); -static int ff_dpdk_if_send_ex(struct ff_dpdk_if_context *ctx, void *m, int total); -static int ff_mmap_init(); - -typedef struct _list_manager_s -{ - uint64_t *ele; - int size; - //int FreeNum; - int top; -}StackList_t; - -static StackList_t ff_mpage_ctl = {0}; -static uint64_t ff_page_start = NULL, ff_page_end = NULL; -static phys_addr_t* ff_mpage_phy = NULL; -static inline void* StkList_pop(StackList_t *p); -static inline int StkList_push(StackList_t * p, uint64_t val); -#endif - extern void ff_hardclock(void); static void @@ -466,21 +350,13 @@ init_mem_pool(void) printf("create mbuf pool on socket %d\n", socketid); } -#ifdef _USE_PAGE_ARRAY_ - if (ff_ref_pool[socketid] != NULL) { - continue; - } - nb_mbuf = RTE_MAX ( +#ifdef FF_USE_PAGE_ARRAY + nb_mbuf = RTE_MAX ( nb_ports*nb_lcores*MAX_PKT_BURST + nb_ports*nb_tx_queue*TX_QUEUE_SIZE + nb_lcores*MEMPOOL_CACHE_SIZE, (unsigned)4096); - snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid); - if (rte_eal_process_type() == RTE_PROC_PRIMARY) { - ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf, MEMPOOL_CACHE_SIZE, 0, 0, socketid); - } else { - ff_ref_pool[socketid] = rte_mempool_lookup(s); - } + ff_init_ref_pool(nb_mbuf, socketid); #endif } @@ -917,7 +793,7 @@ ff_dpdk_init(int argc, char **argv) } #endif -#ifdef _USE_PAGE_ARRAY_ +#ifdef FF_USE_PAGE_ARRAY ff_mmap_init(); #endif @@ -1361,17 +1237,17 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) uint16_t i; for (i = 0; i < ret; i++) { ff_traffic.tx_bytes += 
rte_pktmbuf_pkt_len(m_table[i]); -#ifdef _USE_PAGE_ARRAY_ +#ifdef FF_USE_PAGE_ARRAY if (qconf->tx_mbufs[port].bsd_m_table[i]) - ff_txring_enqueue(&nic_tx_ring[port], qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs); -#endif + ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs); +#endif } if (unlikely(ret < n)) { do { rte_pktmbuf_free(m_table[ret]); -#ifdef _USE_PAGE_ARRAY_ - if ( qconf->tx_mbufs[port].bsd_m_table[ret] ) - ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]); +#ifdef FF_USE_PAGE_ARRAY + if ( qconf->tx_mbufs[port].bsd_m_table[ret] ) + ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]); #endif } while (++ret < n); } @@ -1404,8 +1280,17 @@ int ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, int total) { -#ifdef _USE_PAGE_ARRAY_ - return ff_dpdk_if_send_ex(ctx, m,total); +#ifdef FF_USE_PAGE_ARRAY + struct lcore_conf *qconf = &lcore_conf; + int len = 0; + + len = ff_if_send_onepkt(ctx, m,total); + if (unlikely(len == MAX_PKT_BURST)) { + send_burst(qconf, MAX_PKT_BURST, ctx->port_id); + len = 0; + } + qconf->tx_mbufs[ctx->port_id].len = len; + return 0; #endif struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); @@ -1758,390 +1643,4 @@ ff_get_tsc_ns() return ((double)cur_tsc/(double)hz) * NS_PER_S; } -#ifdef _USE_PAGE_ARRAY_ -static int StkList_init(StackList_t*p, int size) -{ - int i = 0; - - if (p==NULL || size<=0) - { - return -1; - } - p->size = size; - p->top = 0; - if ( posix_memalign((void**)&p->ele, sizeof(uint64_t), sizeof(uint64_t)*size) != 0) - return -2; - - return 0; -} - -static inline void* StkList_pop(StackList_t *p) -{ - int head = 0; - - if(p==NULL) - return NULL; - - if (p->top > 0 ) - { - return (void*)p->ele[--p->top]; - } - else - return NULL; -} - -//id: the id of element to be freed. -//return code: -1: faile; >=0:OK. 
-static inline int StkList_push(StackList_t *p, const uint64_t val) -{ - int tail = 0; - - if(p==NULL) - return -1; - if (p->top < p->size) - { - p->ele[p->top++] = val; - return 0; - } - else - return -1; -} - -static int StkList_Size(StackList_t * p) -{ - return p->size; -} - -// set (void*) to rte_mbuf's priv_data. -static inline int ff_mbuf_set_uint64(struct rte_mbuf* p, uint64_t data) -{ - if (rte_pktmbuf_priv_size(p->pool) >= sizeof(uint64_t)) - *((uint64_t*)(p+1)) = data; - return 0; -} - -/************************* -* if mbuf has num segment in all, Dev's sw_ring will use num descriptions. ff_txring also use num segments as below: -* <--- num-1 ---->|ptr| head | -* ---------------------------------------------- -* | 0 | 0 | ..............| 0 | p | XXX | -*----------------------------------------------- -*************************/ -static inline int ff_txring_enqueue(struct mbuf_txring* q, void *p, int seg_num) -{ - int i = 0; - for( i=0; im_table[q->head] ) - { - ff_mbuf_free(q->m_table[q->head]); - q->m_table[q->head] = NULL; - } - Head_INC(q->head); - } - if ( q->m_table[q->head] ) - ff_mbuf_free(q->m_table[q->head]); - q->m_table[q->head] = p; - Head_INC(q->head); - - return 0; -} - -// pop out from head-1 . 
-static inline int ff_txring_pop(struct mbuf_txring* q, int num) -{ - int i = 0; - - for (i=0; ihead); - if ( (i==0 && q->m_table[q->head]==NULL) || (i>0 && q->m_table[q->head]!=NULL) ) - { - rte_panic("ff_txring_pop fatal error!"); - } - if ( q->m_table[q->head] != NULL ) - { - ff_mbuf_free(q->m_table[q->head]); - q->m_table[q->head] = NULL; - } - } -} - -static inline void ff_txring_init(struct mbuf_txring* q, uint32_t num) -{ - memset(q, 0, sizeof(struct mbuf_txring)*num); -} - -static int ff_mmap_init() -{ - int err = 0; - int i = 0; - uint64_t virt_addr = NULL; - phys_addr_t phys_addr = 0; - uint64_t bsd_memsz = (ff_global_cfg.freebsd.mem_size << 20); - unsigned int bsd_pagesz = 0; - - ff_page_start = (uint64_t)mmap( NULL, bsd_memsz, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); - if (ff_page_start == (uint64_t)-1) - { - rte_panic("ff_mmap_init get ff_page_start failed, err=%d.\n", errno); - return -1; - } - - if ( mlock((void*)ff_page_start, bsd_memsz)<0 ) - { - rte_panic("mlock failed, err=%d.\n", errno); - return -1; - } - ff_page_end = ff_page_start + bsd_memsz; - - rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %d pages, %d MB.\n", bsd_pagesz, ff_global_cfg.freebsd.mem_size); - printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end); - - bsd_pagesz = (bsd_memsz>>12); - if(posix_memalign((void**)&ff_mpage_phy, sizeof(phys_addr_t), bsd_pagesz*sizeof(phys_addr_t))!=0) - { - rte_panic("posix_memalign get ff_mpage_phy failed, err=%d.\n", errno); - return -1; - } - - StkList_init(&ff_mpage_ctl, bsd_pagesz); - - for (i=0; i ff_page_start && virtaddr < ff_page_end ); -} - -/* - * Get physical address of any mapped virtual address in the current process. 
- */ -static inline uint64_t ff_mem_virt2phy(const void* virtaddr) -{ - uint64_t addr = 0; - uint32_t pages = 0; - - pages = (((uint64_t)virtaddr - (uint64_t)ff_page_start)>>PAGE_SHIFT); - if (pages >= StkList_Size(&ff_mpage_ctl)) - { - rte_panic("ff_mbuf_virt2phy get invalid pages %d.", pages); - return -1; - } - - addr = ff_mpage_phy[pages] + ((const uint64_t)virtaddr & PAGE_MASK); - return addr; -} - -void* ff_mem_get_page() -{ - return (void*)StkList_pop(&ff_mpage_ctl); -} - -int ff_mem_free_addr(void* p) -{ - StkList_push(&ff_mpage_ctl, (const uint64_t)p); - return 0; -} - -static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void* m, struct rte_mbuf *head) -{ - void* data = NULL; - struct ff_tx_offload offload = {0}; - - ff_mbuf_tx_offload(m, &offload); - data = rte_pktmbuf_mtod(head, void*); - - if (offload.ip_csum) { - /* ipv6 not supported yet */ - struct ipv4_hdr *iph; - int iph_len; - iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); - iph_len = (iph->version_ihl & 0x0f) << 2; - - head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; - head->l2_len = ETHER_HDR_LEN; - head->l3_len = iph_len; - } - - if (ctx->hw_features.tx_csum_l4) { - struct ipv4_hdr *iph; - int iph_len; - iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); - iph_len = (iph->version_ihl & 0x0f) << 2; - - if (offload.tcp_csum) { - head->ol_flags |= PKT_TX_TCP_CKSUM; - head->l2_len = ETHER_HDR_LEN; - head->l3_len = iph_len; - } - - /* - * TCP segmentation offload. - * - * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag - * implies PKT_TX_TCP_CKSUM) - * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 - * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and - * write the IP checksum to 0 in the packet - * - fill the mbuf offload information: l2_len, - * l3_len, l4_len, tso_segsz - * - calculate the pseudo header checksum without taking ip_len - * in account, and set it in the TCP header. Refer to - * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be - * used as helpers. 
- */ - if (offload.tso_seg_size) { - struct tcp_hdr *tcph; - int tcph_len; - tcph = (struct tcp_hdr *)((char *)iph + iph_len); - tcph_len = (tcph->data_off & 0xf0) >> 2; - tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); - - head->ol_flags |= PKT_TX_TCP_SEG; - head->l4_len = tcph_len; - head->tso_segsz = offload.tso_seg_size; - } - - if (offload.udp_csum) { - head->ol_flags |= PKT_TX_UDP_CKSUM; - head->l2_len = ETHER_HDR_LEN; - head->l3_len = iph_len; - } - } -} - -// create rte_buf refer to data which is transmit from bsd stack by EXT_CLUSTER. -static inline struct rte_mbuf* ff_extcl_to_rte(void* m) -{ - struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; - struct rte_mbuf* src_mbuf = NULL; - struct rte_mbuf *p_head = NULL; - - src_mbuf = (struct rte_mbuf*)ff_rte_frm_extcl(m); - if ( NULL==src_mbuf ) - { - return NULL; - } - p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool); - if (p_head == NULL) - { - return NULL; - } - - return p_head; -} - -// create rte_mbuf refer to data in bsd mbuf. -static inline struct rte_mbuf* ff_bsd_to_rte(void* m, int total) -{ - struct rte_mempool *mbuf_pool = ff_ref_pool[lcore_conf.socket_id]; - struct rte_mbuf *p_head = NULL; - struct rte_mbuf *cur = NULL, *prev = NULL, *tmp=NULL; - void* data = NULL; - void* p_bsdbuf = NULL; - unsigned len = 0; - - p_head = rte_pktmbuf_alloc(mbuf_pool); - if (p_head == NULL) - { - return NULL; - } - p_head->pkt_len = total; - p_head->nb_segs = 0; - cur = p_head; - p_bsdbuf = m; - while( p_bsdbuf ){ - if (cur == NULL) { - cur = rte_pktmbuf_alloc(mbuf_pool); - if (cur == NULL) { - rte_pktmbuf_free(p_head); - return NULL; - } - } - ff_next_mbuf(&p_bsdbuf, &data, &len); // p_bsdbuf move to next mbuf. 
- cur->buf_addr = data; - cur->buf_physaddr = ff_mem_virt2phy((const void*)(cur->buf_addr)); - cur->data_off = 0; - cur->data_len = len; - - p_head->nb_segs++; - if (prev != NULL) { - prev->next = cur; - } - prev = cur; - cur = NULL; - } - - return p_head; -} - -int ff_dpdk_if_send_ex(struct ff_dpdk_if_context *ctx, void *m, int total) -{ - struct rte_mbuf *head = NULL; - void *src_buf = NULL; - void* p_data = NULL; - struct lcore_conf *qconf = NULL; - unsigned len = 0; - - if ( !m ) - { - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_dpdk_if_send_ex input invalid NULL address."); - return -1; - } - p_data = ff_mbuf_mtod(m); - if ( ff_chk_vma((uint64_t)p_data)) - { - head = ff_bsd_to_rte(m, total); - } - else if( ff_extcl_to_rte(m) ==NULL ) - { - rte_panic("data address 0x%lx is out of page bound or malloc by DPDK recver.", (uint64_t)p_data); - return -1; - } - if (head == NULL) - { - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_dpdk_if_send_ex call ff_bsd_to_rte failed."); - ff_mbuf_free(m); - return -1; - } - - ff_offload_set(ctx, m, head); - qconf = &lcore_conf; - len = qconf->tx_mbufs[ctx->port_id].len; - qconf->tx_mbufs[ctx->port_id].m_table[len] = head; - qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m; - len++; - - /* enough pkts to be sent */ - if (unlikely(len == MAX_PKT_BURST)) { - send_burst(qconf, MAX_PKT_BURST, ctx->port_id); - len = 0; - } - qconf->tx_mbufs[ctx->port_id].len = len; - - return 0; -} - -#endif - - diff --git a/lib/ff_host_interface.c b/lib/ff_host_interface.c index 717a14672..61f89d7eb 100644 --- a/lib/ff_host_interface.c +++ b/lib/ff_host_interface.c @@ -56,9 +56,8 @@ ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset) int host_prot; int host_flags; -#ifdef _USE_PAGE_ARRAY_ - if( len == 4096 ) - { +#ifdef FF_USE_PAGE_ARRAY + if( len == 4096 ){ return ff_mem_get_page(); } else @@ -88,9 +87,8 @@ ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset) int ff_munmap(void *addr, uint64_t 
len) { -#ifdef _USE_PAGE_ARRAY_ - if ( len == 4096 ) - { +#ifdef FF_USE_PAGE_ARRAY + if ( len == 4096 ){ return ff_mem_free_addr(addr); } #endif diff --git a/lib/ff_memory.c b/lib/ff_memory.c new file mode 100644 index 000000000..f5a7443bc --- /dev/null +++ b/lib/ff_memory.c @@ -0,0 +1,480 @@ +/* + * Copyright (C) 2017 THL A29 Limited, a Tencent company. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ff_dpdk_if.h" +#include "ff_dpdk_pcap.h" +#include "ff_dpdk_kni.h" +#include "ff_config.h" +#include "ff_veth.h" +#include "ff_host_interface.h" +#include "ff_msg.h" +#include "ff_api.h" +#include "ff_memory.h" + +#define PAGE_SIZE 4096 +#define PAGE_SHIFT 12 +#define PAGE_MASK (PAGE_SIZE - 1) +#define trunc_page(x) ((x) & ~PAGE_MASK) +#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK) + +extern struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; +extern struct lcore_conf lcore_conf; + +//struct ff_tx_offload; + +// ff_ref_pool allocate rte_mbuf without data space, which data point to bsd mbuf's data address. +static struct rte_mempool *ff_ref_pool[NB_SOCKETS]; + +#define Head_INC(h) {\ + if ( ++h >= TX_QUEUE_SIZE ) \ + h = 0;\ + }; + +#define Head_DEC(h) do{\ + if ( --h < 0 ) \ + h = TX_QUEUE_SIZE-1;\ + }while(0); + +// bsd mbuf was moved into nic_tx_ring from tmp_tables, after rte_eth_tx_burst() succeed. 
+static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS];
+static inline int ff_txring_enqueue(struct mbuf_txring* q, void *p, int seg_num);
+static inline void ff_txring_init(struct mbuf_txring* r, uint32_t len);
+
+/*
+ * Fixed-capacity LIFO of 64-bit values.  In this file the values pushed and
+ * popped are page addresses (see ff_mem_get_page() / ff_mem_free_addr()).
+ */
+typedef struct _list_manager_s
+{
+    uint64_t *ele;  /* storage for `size` entries, allocated by stklist_init() */
+    int size;       /* capacity, in entries */
+    int top;        /* number of stored entries == index of the next free slot */
+}StackList_t;
+
+static StackList_t ff_mpage_ctl = {0};
+/* These hold addresses as integers, so start them at 0 rather than NULL. */
+static uint64_t ff_page_start = 0, ff_page_end = 0;
+static phys_addr_t *ff_mpage_phy = NULL;
+
+static inline void *stklist_pop(StackList_t *p);
+static inline int stklist_push(StackList_t * p, uint64_t val);
+
+/*
+ * Allocate storage for `size` entries and reset the list to empty.
+ *
+ * Returns 0 on success, -1 when `p` is NULL or `size` is not positive, and
+ * -2 when the allocation fails.  The list is only written on success, so a
+ * failed init never advertises a capacity that has no memory behind it.
+ */
+static int stklist_init(StackList_t *p, int size)
+{
+    void *slots = NULL;
+
+    if (p == NULL || size <= 0) {
+        return -1;
+    }
+    if (posix_memalign(&slots, sizeof(uint64_t), sizeof(uint64_t) * (size_t)size) != 0) {
+        return -2;
+    }
+
+    p->ele = slots;
+    p->size = size;
+    p->top = 0;
+    return 0;
+}
+
+/*
+ * Remove and return the most recently pushed value, converted to a pointer.
+ * Returns NULL when `p` is NULL or the list is empty.
+ */
+static inline void *stklist_pop(StackList_t *p)
+{
+    if (p == NULL || p->top <= 0) {
+        return NULL;
+    }
+    return (void *)(uintptr_t)p->ele[--p->top];
+}
+
+/*
+ * Store `val` on top of the list.
+ * Returns 0 on success, -1 when `p` is NULL or the list is already full.
+ */
+static inline int stklist_push(StackList_t *p, const uint64_t val)
+{
+    if (p == NULL || p->top >= p->size) {
+        return -1;
+    }
+    p->ele[p->top++] = val;
+    return 0;
+}
+
+/* Capacity of the list in entries (not the number currently stored). */
+static inline int stklist_size(StackList_t * p)
+{
+    return p->size;
+}
+
+/*
+ * Write `data` into the memory immediately following the rte_mbuf header,
+ * but only when the owning pool reports a private area of at least 8 bytes.
+ * Always returns 0; when the private area is too small nothing is stored.
+ */
+static inline int ff_mbuf_set_uint64(struct rte_mbuf* p, uint64_t data)
+{
+    if (rte_pktmbuf_priv_size(p->pool) >= sizeof(uint64_t)) {
+        *((uint64_t*)(p+1)) = data;
+    }
+    return 0;
+}
+
+/*************************
+* if mbuf has num segment in all, Dev's sw_ring will use num descriptions. 
ff_txring also use num segments as below: +* <--- num-1 ---->|ptr| head | +* ---------------------------------------------- +* | 0 | 0 | ..............| 0 | p | XXX | +*----------------------------------------------- +*************************/ +static inline int ff_txring_enqueue(struct mbuf_txring* q, void *p, int seg_num) +{ + int i = 0; + for ( i=0; im_table[q->head] ){ + ff_mbuf_free(q->m_table[q->head]); + q->m_table[q->head] = NULL; + } + Head_INC(q->head); + } + if ( q->m_table[q->head] ) + ff_mbuf_free(q->m_table[q->head]); + q->m_table[q->head] = p; + Head_INC(q->head); + + return 0; +} + +// pop out from head-1 . +static inline int ff_txring_pop(struct mbuf_txring* q, int num) +{ + int i = 0; + + for (i=0; ihead); + if ( (i==0 && q->m_table[q->head]==NULL) || (i>0 && q->m_table[q->head]!=NULL) ){ + rte_panic("ff_txring_pop fatal error!"); + } + if ( q->m_table[q->head] != NULL ){ + ff_mbuf_free(q->m_table[q->head]); + q->m_table[q->head] = NULL; + } + } +} + +static inline void ff_txring_init(struct mbuf_txring* q, uint32_t num) +{ + memset(q, 0, sizeof(struct mbuf_txring)*num); +} + +void ff_init_ref_pool(int nb_mbuf, int socketid) +{ + char s[64] = {0}; + + if (ff_ref_pool[socketid] != NULL) { + return; + } + snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf, MEMPOOL_CACHE_SIZE, 0, 0, socketid); + } else { + ff_ref_pool[socketid] = rte_mempool_lookup(s); + } +} + +int ff_mmap_init() +{ + int err = 0; + int i = 0; + uint64_t virt_addr = NULL; + phys_addr_t phys_addr = 0; + uint64_t bsd_memsz = (ff_global_cfg.freebsd.mem_size << 20); + unsigned int bsd_pagesz = 0; + + ff_page_start = (uint64_t)mmap( NULL, bsd_memsz, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); + if (ff_page_start == (uint64_t)-1){ + rte_panic("ff_mmap_init get ff_page_start failed, err=%d.\n", errno); + return -1; + } + + if ( 
mlock((void*)ff_page_start, bsd_memsz)<0 ) { + rte_panic("mlock failed, err=%d.\n", errno); + return -1; + } + ff_page_end = ff_page_start + bsd_memsz; + bsd_pagesz = (bsd_memsz>>12); + rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %d pages, %d MB.\n", bsd_pagesz, ff_global_cfg.freebsd.mem_size); + printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end); + + if (posix_memalign((void**)&ff_mpage_phy, sizeof(phys_addr_t), bsd_pagesz*sizeof(phys_addr_t))!=0){ + rte_panic("posix_memalign get ff_mpage_phy failed, err=%d.\n", errno); + return -1; + } + + stklist_init(&ff_mpage_ctl, bsd_pagesz); + + for (i=0; i ff_page_start && virtaddr < ff_page_end ); +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +static inline uint64_t ff_mem_virt2phy(const void* virtaddr) +{ + uint64_t addr = 0; + uint32_t pages = 0; + + pages = (((uint64_t)virtaddr - (uint64_t)ff_page_start)>>PAGE_SHIFT); + if (pages >= stklist_size(&ff_mpage_ctl)){ + rte_panic("ff_mbuf_virt2phy get invalid pages %d.", pages); + return -1; + } + + addr = ff_mpage_phy[pages] + ((const uint64_t)virtaddr & PAGE_MASK); + return addr; +} + +void *ff_mem_get_page() +{ + return (void*)stklist_pop(&ff_mpage_ctl); +} + +int ff_mem_free_addr(void *p) +{ + stklist_push(&ff_mpage_ctl, (const uint64_t)p); + return 0; +} + +static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void *m, struct rte_mbuf *head) +{ + void *data = NULL; + struct ff_tx_offload offload = {0}; + + ff_mbuf_tx_offload(m, &offload); + data = rte_pktmbuf_mtod(head, void*); + + if (offload.ip_csum) { + /* ipv6 not supported yet */ + struct ipv4_hdr *iph; + int iph_len; + iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); + iph_len = (iph->version_ihl & 0x0f) << 2; + + head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; + head->l2_len = ETHER_HDR_LEN; + head->l3_len = iph_len; + } + + if (ctx->hw_features.tx_csum_l4) { + struct ipv4_hdr *iph; + int iph_len; + iph = (struct 
ipv4_hdr *)(data + ETHER_HDR_LEN); + iph_len = (iph->version_ihl & 0x0f) << 2; + + if (offload.tcp_csum) { + head->ol_flags |= PKT_TX_TCP_CKSUM; + head->l2_len = ETHER_HDR_LEN; + head->l3_len = iph_len; + } + + /* + * TCP segmentation offload. + * + * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag + * implies PKT_TX_TCP_CKSUM) + * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 + * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and + * write the IP checksum to 0 in the packet + * - fill the mbuf offload information: l2_len, + * l3_len, l4_len, tso_segsz + * - calculate the pseudo header checksum without taking ip_len + * in account, and set it in the TCP header. Refer to + * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be + * used as helpers. + */ + if (offload.tso_seg_size) { + struct tcp_hdr *tcph; + int tcph_len; + tcph = (struct tcp_hdr *)((char *)iph + iph_len); + tcph_len = (tcph->data_off & 0xf0) >> 2; + tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); + + head->ol_flags |= PKT_TX_TCP_SEG; + head->l4_len = tcph_len; + head->tso_segsz = offload.tso_seg_size; + } + + if (offload.udp_csum) { + head->ol_flags |= PKT_TX_UDP_CKSUM; + head->l2_len = ETHER_HDR_LEN; + head->l3_len = iph_len; + } + } +} + +// create rte_buf refer to data which is transmit from bsd stack by EXT_CLUSTER. +static inline struct rte_mbuf* ff_extcl_to_rte(void *m ) +{ + struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; + struct rte_mbuf *src_mbuf = NULL; + struct rte_mbuf *p_head = NULL; + + src_mbuf = (struct rte_mbuf*)ff_rte_frm_extcl(m); + if ( NULL==src_mbuf ){ + return NULL; + } + p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool); + if (p_head == NULL){ + return NULL; + } + + return p_head; +} + +// create rte_mbuf refer to data in bsd mbuf. 
+static inline struct rte_mbuf* ff_bsd_to_rte(void *m, int total)
+{
+    /*
+     * Each segment is an rte_mbuf taken from ff_ref_pool whose buffer
+     * pointers are redirected at the data of one mbuf in the BSD chain `m`.
+     * `total` becomes the packet length of the head segment.  Returns NULL
+     * when the pool cannot supply a segment; segments linked so far are
+     * released, the BSD chain itself is left for the caller to free.
+     */
+    struct rte_mempool *mbuf_pool = ff_ref_pool[lcore_conf.socket_id];
+    struct rte_mbuf *p_head = NULL;
+    struct rte_mbuf *cur = NULL, *prev = NULL;
+    void *data = NULL;
+    void *p_bsdbuf = m;
+    unsigned len = 0;
+
+    p_head = rte_pktmbuf_alloc(mbuf_pool);
+    if (p_head == NULL) {
+        return NULL;
+    }
+    p_head->pkt_len = total;
+    p_head->nb_segs = 0;
+    cur = p_head;
+
+    while (p_bsdbuf) {
+        /* The head was allocated above; every later segment is allocated here. */
+        if (cur == NULL) {
+            cur = rte_pktmbuf_alloc(mbuf_pool);
+            if (cur == NULL) {
+                rte_pktmbuf_free(p_head);
+                return NULL;
+            }
+        }
+
+        ff_next_mbuf(&p_bsdbuf, &data, &len); // p_bsdbuf move to next mbuf.
+        cur->buf_addr = data;
+        cur->buf_physaddr = ff_mem_virt2phy((const void*)(cur->buf_addr));
+        cur->data_off = 0;
+        cur->data_len = len;
+
+        p_head->nb_segs++;
+        if (prev != NULL) {
+            prev->next = cur;
+        }
+        prev = cur;
+        cur = NULL;
+    }
+
+    return p_head;
+}
+
+/*
+ * Convert the BSD mbuf chain `m` into an rte_mbuf and append it to the
+ * pending TX table of port ctx->port_id on this lcore.
+ *
+ * Returns the number of packets now pending in that table.  The caller
+ * stores this value back as the table length, so every failure path returns
+ * the current count unchanged.  Returning 0 there would make the caller
+ * forget, and leak, every packet already queued for the burst.
+ *
+ * When the conversion fails for a non-NULL `m`, `m` is freed here.
+ */
+int ff_if_send_onepkt(struct ff_dpdk_if_context *ctx, void *m, int total)
+{
+    struct lcore_conf *qconf = &lcore_conf;
+    struct rte_mbuf *head = NULL;
+    void *p_data = NULL;
+    unsigned len = qconf->tx_mbufs[ctx->port_id].len;
+
+    if (!m) {
+        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt input invalid NULL address.");
+        return (int)len;
+    }
+
+    p_data = ff_mbuf_mtod(m);
+    if (ff_chk_vma((uint64_t)p_data)) {
+        /* Data lies inside the page array: reference it without copying. */
+        head = ff_bsd_to_rte(m, total);
+    }
+    else if ((head = ff_extcl_to_rte(m)) == NULL) {
+        rte_panic("data address 0x%lx is out of page bound or not malloced by DPDK recver.", (uint64_t)p_data);
+        return (int)len;
+    }
+
+    if (head == NULL) {
+        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_if_send_onepkt call ff_bsd_to_rte failed.");
+        ff_mbuf_free(m);
+        return (int)len;
+    }
+
+    ff_offload_set(ctx, m, head);
+    /* Keep the BSD mbuf next to its rte_mbuf so send_burst() can release it. */
+    qconf->tx_mbufs[ctx->port_id].m_table[len] = head;
+    qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m;
+    len++;
+
+    return (int)len;
+}
+
+int ff_enq_tx_bsdmbuf(uint8_t portid, void 
*p_mbuf, int nb_segs) +{ + return ff_txring_enqueue(&nic_tx_ring[portid], p_mbuf, nb_segs); +} + diff --git a/lib/ff_memory.h b/lib/ff_memory.h new file mode 100644 index 000000000..52fd5fab5 --- /dev/null +++ b/lib/ff_memory.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2017 THL A29 Limited, a Tencent company. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef __FSTACK_MEMORY_H +#define __FSTACK_MEMORY_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define MEMPOOL_CACHE_SIZE 256 + +#define DISPATCH_RING_SIZE 2048 + +#define MSG_RING_SIZE 32 + +/* + * Configurable number of RX/TX ring descriptors + */ +#define RX_QUEUE_SIZE 512 +#define TX_QUEUE_SIZE 512 + +#define MAX_PKT_BURST 32 +#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ + +/* + * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send. + */ +#define MAX_TX_BURST (MAX_PKT_BURST / 2) + +#define NB_SOCKETS 8 + +/* Configure how many packets ahead to prefetch, when reading packets */ +#define PREFETCH_OFFSET 3 + +#define MAX_RX_QUEUE_PER_LCORE 16 +#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS +#define MAX_RX_QUEUE_PER_PORT 128 + +struct ff_dpdk_if_context { + void *sc; + void *ifp; + uint16_t port_id; + struct ff_hw_features hw_features; +} __rte_cache_aligned; + +struct mbuf_table { + uint16_t len; + struct rte_mbuf *m_table[MAX_PKT_BURST]; +#ifdef FF_USE_PAGE_ARRAY + void* bsd_m_table[MAX_PKT_BURST]; // save bsd mbuf address which will be enquene into txring after NIC transmitted pkt. +#endif +}; + +struct lcore_rx_queue { + uint16_t port_id; + uint16_t queue_id; +} __rte_cache_aligned; + +struct lcore_conf { + uint16_t proc_id; + uint16_t socket_id; + uint16_t nb_queue_list[RTE_MAX_ETHPORTS]; + struct ff_port_cfg *port_cfgs; + + uint16_t nb_rx_queue; + struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; + uint16_t nb_tx_port; + uint16_t tx_port_id[RTE_MAX_ETHPORTS]; + uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; + struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS]; + char *pcap[RTE_MAX_ETHPORTS]; +} __rte_cache_aligned; + +#ifdef FF_USE_PAGE_ARRAY +// mbuf_txring save mbuf which had bursted into NIC, m_tables has same length with NIC dev's sw_ring. +// Then when txring.m_table[x] is reused, the packet in txring.m_table[x] had been transmited by NIC. +// that means the mbuf can be freed safely. 
+struct mbuf_txring{ + void* m_table[TX_QUEUE_SIZE]; + uint16_t head; // next available element. +}; + +void ff_init_ref_pool(int nb_mbuf, int socketid); +int ff_mmap_init(); +int ff_if_send_onepkt(struct ff_dpdk_if_context *ctx, void *m, int total); +int ff_enq_tx_bsdmbuf(uint8_t portid, void *p_mbuf, int nb_segs); +#endif + +#ifdef __cplusplus +} +#endif + +#endif + + diff --git a/lib/ff_veth.c b/lib/ff_veth.c index 5e710c846..3499c0aeb 100644 --- a/lib/ff_veth.c +++ b/lib/ff_veth.c @@ -448,10 +448,9 @@ void * ff_mbuf_mtod(void* bsd_mbuf) // get source rte_mbuf from ext cluster, which carry rte_mbuf while recving pkt, such as arp. void* ff_rte_frm_extcl(void* mbuf) { - struct mbuf* bsd_mbuf = mbuf; + struct mbuf *bsd_mbuf = mbuf; - if ( bsd_mbuf->m_ext.ext_type==EXT_DISPOSABLE && bsd_mbuf->m_ext.ext_free==ff_mbuf_ext_free ) - { + if ( bsd_mbuf->m_ext.ext_type==EXT_DISPOSABLE && bsd_mbuf->m_ext.ext_free==ff_mbuf_ext_free ){ return bsd_mbuf->m_ext.ext_arg1; } else