From d817ab20e7dde0025f090eb592f23591ce0a7008 Mon Sep 17 00:00:00 2001
From: 10077240
Date: Fri, 29 Mar 2019 10:42:02 +0800
Subject: [PATCH] avoid memory copy when transmitting from bsd to dpdk

---
 lib/ff_api.symlist      |   3 +
 lib/ff_config.c         |   3 +
 lib/ff_config.h         |   1 +
 lib/ff_dpdk_if.c        | 476 ++++++++++++++++++++++++++++++++++++++++
 lib/ff_host_interface.c |  18 ++
 lib/ff_veth.c           |  39 ++++
 lib/ff_veth.h           |   3 +
 7 files changed, 543 insertions(+)

diff --git a/lib/ff_api.symlist b/lib/ff_api.symlist
index 4c2558ea7..137329034 100755
--- a/lib/ff_api.symlist
+++ b/lib/ff_api.symlist
@@ -50,3 +50,6 @@ ff_getsockopt_freebsd
 ff_setsockopt_freebsd
 ff_dup
 ff_dup2
+ff_next_mbuf
+ff_mbuf_mtod
+ff_rte_frm_extcl
diff --git a/lib/ff_config.c b/lib/ff_config.c
index b3183e659..6802ada25 100644
--- a/lib/ff_config.c
+++ b/lib/ff_config.c
@@ -506,6 +506,8 @@ ini_parse_handler(void* user, const char* section, const char* name,
         pconfig->freebsd.physmem = atol(value);
     } else if (strcmp(name, "fd_reserve") == 0) {
         pconfig->freebsd.fd_reserve = atoi(value);
+    } else if (strcmp(name, "memsz_MB") == 0) {
+        pconfig->freebsd.mem_size = atoi(value);
     } else {
         return freebsd_conf_handler(pconfig, "boot", name, value);
     }
@@ -713,6 +715,7 @@ ff_default_config(struct ff_config *cfg)
     cfg->freebsd.hz = 100;
     cfg->freebsd.physmem = 1048576*256;
     cfg->freebsd.fd_reserve = 0;
+    cfg->freebsd.mem_size = 256;
 }
 
 int
diff --git a/lib/ff_config.h b/lib/ff_config.h
index 621c7845a..f3006e504 100644
--- a/lib/ff_config.h
+++ b/lib/ff_config.h
@@ -138,6 +138,7 @@ struct ff_config {
         long physmem;
         int hz;
         int fd_reserve;
+        int mem_size;
     } freebsd;
 };
diff --git a/lib/ff_dpdk_if.c b/lib/ff_dpdk_if.c
index e7162a432..7646ed7b2 100644
--- a/lib/ff_dpdk_if.c
+++ b/lib/ff_dpdk_if.c
@@ -25,6 +25,8 @@
  */
 #include <assert.h>
 #include <unistd.h>
+#include <sys/mman.h>
+#include <errno.h>
 #include <string.h>
 #include <stdlib.h>
@@ -66,6 +68,12 @@
 
 #define MSG_RING_SIZE 32
 
+#define PAGE_SIZE 4096
+#define PAGE_SHIFT 12
+#define PAGE_MASK (PAGE_SIZE - 1)
+#define trunc_page(x) ((x) & ~PAGE_MASK)
+#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK)
+
 /*
  * Configurable number of RX/TX ring descriptors
  */
@@ -141,6 +149,10 @@ static struct rte_eth_conf default_port_conf = {
 struct mbuf_table {
     uint16_t len;
     struct rte_mbuf *m_table[MAX_PKT_BURST];
+#ifdef _USE_PAGE_ARRAY_
+    // bsd mbufs backing m_table[i], freed once the NIC has sent the data
+    void *bsd_m_table[MAX_PKT_BURST];
+#endif
 };
 
 struct lcore_rx_queue {
@@ -194,6 +206,50 @@ static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
 static struct ff_top_args ff_top_status;
 static struct ff_traffic_args ff_traffic;
 
+#ifdef _USE_PAGE_ARRAY_
+
+// ff_ref_pool allocates rte_mbufs with no data room of their own; each
+// mbuf's data pointer is set to the data address of a bsd mbuf instead.
+static struct rte_mempool *ff_ref_pool[NB_SOCKETS];
+
+// mbuf_txring records the bsd mbufs whose data has been burst to the NIC.
+// m_table has the same length as the NIC device's sw_ring, so by the time
+// a slot in txring.m_table[x] is reused, the packet that previously held
+// the slot has already been transmitted and its bsd mbuf can be freed
+// safely.
+struct mbuf_txring {
+    void *m_table[TX_QUEUE_SIZE];
+    uint16_t head; // next available slot
+};
+
+#define Head_INC(h) do { \
+    if (++(h) >= TX_QUEUE_SIZE) \
+        (h) = 0; \
+} while (0)
+
+// head is unsigned, so check for wrap before decrementing
+#define Head_DEC(h) do { \
+    if ((h) == 0) \
+        (h) = TX_QUEUE_SIZE - 1; \
+    else \
+        (h)--; \
+} while (0)
+
+// bsd mbufs are moved from tx_mbufs[port].bsd_m_table into nic_tx_ring
+// once rte_eth_tx_burst() succeeds.
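+//
+// Illustration: a 3-segment bsd chain sent through ff_dpdk_if_send_ex()
+// consumes 3 NIC tx descriptors, so ff_txring_enqueue(&nic_tx_ring[port],
+// m, 3) stores {NULL, NULL, m} and advances head by 3; m is then freed
+// exactly once, when those slots are next recycled.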
+static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS];
+
+static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num);
+static inline void ff_txring_init(struct mbuf_txring *q, uint32_t num);
+static int ff_dpdk_if_send_ex(struct ff_dpdk_if_context *ctx, void *m, int total);
+static int ff_mmap_init(void);
+
+// simple LIFO stack of free page addresses
+typedef struct _list_manager_s
+{
+    uint64_t *ele;
+    int size;
+    int top;
+} StackList_t;
+
+static StackList_t ff_mpage_ctl = {0};
+static uint64_t ff_page_start = 0, ff_page_end = 0;
+static phys_addr_t *ff_mpage_phy = NULL;
+static inline void *StkList_pop(StackList_t *p);
+static inline int StkList_push(StackList_t *p, uint64_t val);
+#endif
+
 extern void ff_hardclock(void);
 
 static void
@@ -409,6 +465,23 @@ init_mem_pool(void)
     } else {
         printf("create mbuf pool on socket %d\n", socketid);
     }
+
+#ifdef _USE_PAGE_ARRAY_
+        if (ff_ref_pool[socketid] != NULL) {
+            continue;
+        }
+        nb_mbuf = RTE_MAX(
+            nb_ports*nb_lcores*MAX_PKT_BURST +
+            nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
+            nb_lcores*MEMPOOL_CACHE_SIZE,
+            (unsigned)4096);
+        snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid);
+        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+            // data_room_size is 0: these mbufs only reference external data
+            ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf,
+                MEMPOOL_CACHE_SIZE, 0, 0, socketid);
+        } else {
+            ff_ref_pool[socketid] = rte_mempool_lookup(s);
+        }
+#endif
     }
 
     return 0;
@@ -844,6 +917,11 @@ ff_dpdk_init(int argc, char **argv)
     }
 #endif
 
+#ifdef _USE_PAGE_ARRAY_
+    ff_mmap_init();
+#endif
+
     ret = init_port_start();
     if (ret < 0) {
         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
     }
@@ -1283,10 +1361,18 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
         uint16_t i;
         for (i = 0; i < ret; i++) {
             ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
+#ifdef _USE_PAGE_ARRAY_
+            // sent successfully: park the bsd mbuf until its slots recycle
+            if (qconf->tx_mbufs[port].bsd_m_table[i])
+                ff_txring_enqueue(&nic_tx_ring[port],
+                    qconf->tx_mbufs[port].bsd_m_table[i],
+                    m_table[i]->nb_segs);
+#endif
         }
     if (unlikely(ret < n)) {
         do {
             rte_pktmbuf_free(m_table[ret]);
+#ifdef _USE_PAGE_ARRAY_
+            // not sent: the bsd mbuf can be freed immediately
+            if (qconf->tx_mbufs[port].bsd_m_table[ret])
+                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
+#endif
         } while (++ret < n);
     }
     return 0;
@@ -1318,6 +1404,9 @@
 int
 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, int total)
 {
+#ifdef _USE_PAGE_ARRAY_
+    // zero-copy path; the copying path below is unreachable in this build
+    return ff_dpdk_if_send_ex(ctx, m, total);
+#endif
     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
     if (head == NULL) {
@@ -1669,3 +1758,390 @@ ff_get_tsc_ns()
     return ((double)cur_tsc/(double)hz) * NS_PER_S;
 }
 
+#ifdef _USE_PAGE_ARRAY_
+static int StkList_init(StackList_t *p, int size)
+{
+    if (p == NULL || size <= 0)
+        return -1;
+
+    p->size = size;
+    p->top = 0;
+    if (posix_memalign((void **)&p->ele, sizeof(uint64_t),
+            sizeof(uint64_t) * size) != 0)
+        return -2;
+
+    return 0;
+}
+
+static inline void *StkList_pop(StackList_t *p)
+{
+    if (p == NULL)
+        return NULL;
+
+    if (p->top > 0)
+        return (void *)p->ele[--p->top];
+
+    return NULL;
+}
+
+// push a page address back onto the stack; returns 0 on success,
+// -1 on failure (NULL stack or stack full).
+static inline int StkList_push(StackList_t *p, const uint64_t val)
+{
+    if (p == NULL)
+        return -1;
+
+    if (p->top < p->size) {
+        p->ele[p->top++] = val;
+        return 0;
+    }
+
+    return -1;
+}
+
+static int StkList_Size(StackList_t *p)
+{
+    return p->size;
+}
+
+// Store a 64-bit value (e.g. a pointer) in an rte_mbuf's private data area.
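+// The private area begins immediately after the struct rte_mbuf header,
+// hence the (p + 1) below; the value is only written when the pool was
+// created with priv_size >= sizeof(uint64_t).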
+static inline int ff_mbuf_set_uint64(struct rte_mbuf *p, uint64_t data)
+{
+    if (rte_pktmbuf_priv_size(p->pool) >= sizeof(uint64_t))
+        *((uint64_t *)(p + 1)) = data;
+    return 0;
+}
+
+/*************************************************************
+ * If an mbuf chain has num segments in total, the device's sw_ring uses
+ * num descriptors, so ff_txring also takes num slots: num - 1 NULL
+ * placeholders followed by the bsd mbuf pointer p, with head pointing
+ * at the next free slot.
+ *
+ *      <---  num-1  --->|ptr| head |
+ *  ----------------------------------------------
+ *  | 0 | 0 | ..........| 0 | p | XXX |
+ *  ----------------------------------------------
+ *************************************************************/
+static inline int ff_txring_enqueue(struct mbuf_txring *q, void *p, int seg_num)
+{
+    int i;
+
+    // occupy seg_num - 1 placeholder slots, freeing whatever they held
+    for (i = 0; i < seg_num - 1; i++) {
+        if (q->m_table[q->head]) {
+            ff_mbuf_free(q->m_table[q->head]);
+            q->m_table[q->head] = NULL;
+        }
+        Head_INC(q->head);
+    }
+    if (q->m_table[q->head])
+        ff_mbuf_free(q->m_table[q->head]);
+    q->m_table[q->head] = p;
+    Head_INC(q->head);
+
+    return 0;
+}
+
+// pop num slots back, starting from head - 1.
+static inline void ff_txring_pop(struct mbuf_txring *q, int num)
+{
+    int i;
+
+    for (i = 0; i < num; i++) {
+        Head_DEC(q->head);
+        // slot head-1 must hold the pointer; the rest must be placeholders
+        if ((i == 0 && q->m_table[q->head] == NULL) ||
+            (i > 0 && q->m_table[q->head] != NULL)) {
+            rte_panic("ff_txring_pop fatal error!");
+        }
+        if (q->m_table[q->head] != NULL) {
+            ff_mbuf_free(q->m_table[q->head]);
+            q->m_table[q->head] = NULL;
+        }
+    }
+}
+
+static inline void ff_txring_init(struct mbuf_txring *q, uint32_t num)
+{
+    memset(q, 0, sizeof(struct mbuf_txring) * num);
+}
+
+static int ff_mmap_init(void)
+{
+    int i = 0;
+    uint64_t virt_addr = 0;
+    phys_addr_t phys_addr = 0;
+    uint64_t bsd_memsz = ((uint64_t)ff_global_cfg.freebsd.mem_size << 20);
+    unsigned int bsd_pagesz = 0;
+
+    ff_page_start = (uint64_t)mmap(NULL, bsd_memsz, PROT_READ | PROT_WRITE,
+        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+    if (ff_page_start == (uint64_t)-1) {
+        rte_panic("ff_mmap_init get ff_page_start failed, err=%d.\n", errno);
+        return -1;
+    }
+
+    if (mlock((void *)ff_page_start, bsd_memsz) < 0) {
+        rte_panic("mlock failed, err=%d.\n", errno);
+        return -1;
+    }
+    ff_page_end = ff_page_start + bsd_memsz;
+    bsd_pagesz = (bsd_memsz >> PAGE_SHIFT);
+
+    rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %u pages, %d MB.\n",
+        bsd_pagesz, ff_global_cfg.freebsd.mem_size);
+    printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end);
+
+    if (posix_memalign((void **)&ff_mpage_phy, sizeof(phys_addr_t),
+            bsd_pagesz * sizeof(phys_addr_t)) != 0) {
+        rte_panic("posix_memalign get ff_mpage_phy failed, err=%d.\n", errno);
+        return -1;
+    }
+
+    StkList_init(&ff_mpage_ctl, bsd_pagesz);
+
+    // push every page onto the free list and record its physical address
+    for (i = 0; i < bsd_pagesz; i++) {
+        virt_addr = ff_page_start + (uint64_t)i * PAGE_SIZE;
+        StkList_push(&ff_mpage_ctl, virt_addr);
+        phys_addr = rte_mem_virt2phy((const void *)virt_addr);
+        ff_mpage_phy[i] = phys_addr;
+    }
+
+    return 0;
+}
+
+// check whether an address lies inside the pre-mapped bsd page region.
+static inline int ff_chk_vma(uint64_t virtaddr)
+{
+    return (virtaddr >= ff_page_start && virtaddr < ff_page_end);
+}
+
+/*
+ * Get the physical address of a virtual address inside the pre-mapped
+ * bsd page region.
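+ *
+ * The translation is a pure table lookup; physical addresses were captured
+ * once at init time into ff_mpage_phy[]:
+ *     idx  = (virt - ff_page_start) >> PAGE_SHIFT;
+ *     phys = ff_mpage_phy[idx] + (virt & PAGE_MASK);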
+ */ +static inline uint64_t ff_mem_virt2phy(const void* virtaddr) +{ + uint64_t addr = 0; + uint32_t pages = 0; + + pages = (((uint64_t)virtaddr - (uint64_t)ff_page_start)>>PAGE_SHIFT); + if (pages >= StkList_Size(&ff_mpage_ctl)) + { + rte_panic("ff_mbuf_virt2phy get invalid pages %d.", pages); + return -1; + } + + addr = ff_mpage_phy[pages] + ((const uint64_t)virtaddr & PAGE_MASK); + return addr; +} + +void* ff_mem_get_page() +{ + return (void*)StkList_pop(&ff_mpage_ctl); +} + +int ff_mem_free_addr(void* p) +{ + StkList_push(&ff_mpage_ctl, (const uint64_t)p); + return 0; +} + +static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void* m, struct rte_mbuf *head) +{ + void* data = NULL; + struct ff_tx_offload offload = {0}; + + ff_mbuf_tx_offload(m, &offload); + data = rte_pktmbuf_mtod(head, void*); + + if (offload.ip_csum) { + /* ipv6 not supported yet */ + struct ipv4_hdr *iph; + int iph_len; + iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); + iph_len = (iph->version_ihl & 0x0f) << 2; + + head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; + head->l2_len = ETHER_HDR_LEN; + head->l3_len = iph_len; + } + + if (ctx->hw_features.tx_csum_l4) { + struct ipv4_hdr *iph; + int iph_len; + iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); + iph_len = (iph->version_ihl & 0x0f) << 2; + + if (offload.tcp_csum) { + head->ol_flags |= PKT_TX_TCP_CKSUM; + head->l2_len = ETHER_HDR_LEN; + head->l3_len = iph_len; + } + + /* + * TCP segmentation offload. + * + * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag + * implies PKT_TX_TCP_CKSUM) + * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 + * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and + * write the IP checksum to 0 in the packet + * - fill the mbuf offload information: l2_len, + * l3_len, l4_len, tso_segsz + * - calculate the pseudo header checksum without taking ip_len + * in account, and set it in the TCP header. Refer to + * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be + * used as helpers. + */ + if (offload.tso_seg_size) { + struct tcp_hdr *tcph; + int tcph_len; + tcph = (struct tcp_hdr *)((char *)iph + iph_len); + tcph_len = (tcph->data_off & 0xf0) >> 2; + tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); + + head->ol_flags |= PKT_TX_TCP_SEG; + head->l4_len = tcph_len; + head->tso_segsz = offload.tso_seg_size; + } + + if (offload.udp_csum) { + head->ol_flags |= PKT_TX_UDP_CKSUM; + head->l2_len = ETHER_HDR_LEN; + head->l3_len = iph_len; + } + } +} + +// create rte_buf refer to data which is transmit from bsd stack by EXT_CLUSTER. +static inline struct rte_mbuf* ff_extcl_to_rte(void* m) +{ + struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; + struct rte_mbuf* src_mbuf = NULL; + struct rte_mbuf *p_head = NULL; + + src_mbuf = (struct rte_mbuf*)ff_rte_frm_extcl(m); + if ( NULL==src_mbuf ) + { + return NULL; + } + p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool); + if (p_head == NULL) + { + return NULL; + } + + return p_head; +} + +// create rte_mbuf refer to data in bsd mbuf. 
+static inline struct rte_mbuf *ff_bsd_to_rte(void *m, int total)
+{
+    struct rte_mempool *mbuf_pool = ff_ref_pool[lcore_conf.socket_id];
+    struct rte_mbuf *p_head = NULL;
+    struct rte_mbuf *cur = NULL, *prev = NULL;
+    void *data = NULL;
+    void *p_bsdbuf = NULL;
+    unsigned len = 0;
+
+    p_head = rte_pktmbuf_alloc(mbuf_pool);
+    if (p_head == NULL)
+        return NULL;
+
+    p_head->pkt_len = total;
+    p_head->nb_segs = 0;
+    cur = p_head;
+    p_bsdbuf = m;
+    while (p_bsdbuf) {
+        if (cur == NULL) {
+            cur = rte_pktmbuf_alloc(mbuf_pool);
+            if (cur == NULL) {
+                rte_pktmbuf_free(p_head);
+                return NULL;
+            }
+        }
+        ff_next_mbuf(&p_bsdbuf, &data, &len); // advances p_bsdbuf to the next bsd mbuf
+        cur->buf_addr = data;
+        cur->buf_physaddr = ff_mem_virt2phy((const void *)(cur->buf_addr));
+        cur->data_off = 0;
+        cur->data_len = len;
+
+        p_head->nb_segs++;
+        if (prev != NULL) {
+            prev->next = cur;
+        }
+        prev = cur;
+        cur = NULL;
+    }
+
+    return p_head;
+}
+
+int ff_dpdk_if_send_ex(struct ff_dpdk_if_context *ctx, void *m, int total)
+{
+    struct rte_mbuf *head = NULL;
+    void *p_data = NULL;
+    struct lcore_conf *qconf = NULL;
+    unsigned len = 0;
+
+    if (!m) {
+        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1,
+            "ff_dpdk_if_send_ex was passed a NULL mbuf.");
+        return -1;
+    }
+
+    p_data = ff_mbuf_mtod(m);
+    if (ff_chk_vma((uint64_t)p_data)) {
+        head = ff_bsd_to_rte(m, total);
+    } else if ((head = ff_extcl_to_rte(m)) == NULL) {
+        rte_panic("data address 0x%lx is out of the page bound and was not "
+            "allocated by the DPDK receive path.", (uint64_t)p_data);
+        return -1;
+    }
+
+    if (head == NULL) {
+        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1,
+            "ff_dpdk_if_send_ex: ff_bsd_to_rte failed.");
+        ff_mbuf_free(m);
+        return -1;
+    }
+
+    ff_offload_set(ctx, m, head);
+    qconf = &lcore_conf;
+    len = qconf->tx_mbufs[ctx->port_id].len;
+    qconf->tx_mbufs[ctx->port_id].m_table[len] = head;
+    qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m;
+    len++;
+
+    /* enough pkts to be sent */
+    if (unlikely(len == MAX_PKT_BURST)) {
+        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
+        len = 0;
+    }
+    qconf->tx_mbufs[ctx->port_id].len = len;
+
+    return 0;
+}
+
+#endif
diff --git a/lib/ff_host_interface.c b/lib/ff_host_interface.c
index 0b239d32e..717a14672 100644
--- a/lib/ff_host_interface.c
+++ b/lib/ff_host_interface.c
@@ -46,6 +46,8 @@
 #include "ff_errno.h"
 
 static struct timespec current_ts;
+extern void *ff_mem_get_page(void);
+extern int ff_mem_free_addr(void *p);
 
 void *
 ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset)
@@ -54,6 +56,15 @@ ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset)
     int host_prot;
     int host_flags;
 
+#ifdef _USE_PAGE_ARRAY_
+    // single-page requests from the bsd stack are served from the page array
+    if (len == 4096)
+    {
+        return ff_mem_get_page();
+    }
+    else
+#endif
+    {
     assert(ff_PROT_NONE == PROT_NONE);
     host_prot = 0;
     if ((prot & ff_PROT_READ) == ff_PROT_READ) host_prot |= PROT_READ;
@@ -71,11 +82,18 @@ ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset)
         exit(1);
     }
     return ret;
+    }
 }
 
 int
 ff_munmap(void *addr, uint64_t len)
 {
+#ifdef _USE_PAGE_ARRAY_
+    // pages handed out by ff_mmap() go back to the page array
+    if (len == 4096)
+    {
+        return ff_mem_free_addr(addr);
+    }
+#endif
     //rte_free(addr);
     //return 0;
     return (munmap(addr, len));
diff --git a/lib/ff_veth.c b/lib/ff_veth.c
index 0f0b09efd..5e710c846 100644
--- a/lib/ff_veth.c
+++ b/lib/ff_veth.c
@@ -420,3 +420,42 @@ ff_veth_softc_to_hostc(void *softc)
     return (void *)sc->host_ctx;
 }
 
+/********************
+* Return the current bsd mbuf's data pointer and length, and advance
+* *mbuf_bsd to the next mbuf in the chain (NULL at the end).
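+*
+* Typical traversal, as in ff_bsd_to_rte():
+*     while (p) {
+*         ff_next_mbuf(&p, &data, &len); // consume one segment
+*         ...
+*     }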
+********************/
+int ff_next_mbuf(void **mbuf_bsd, void **data, unsigned *len)
+{
+    struct mbuf *mb = *(struct mbuf **)mbuf_bsd;
+
+    *len = mb->m_len;
+    *data = mb->m_data;
+
+    if (mb->m_next)
+        *mbuf_bsd = mb->m_next;
+    else
+        *mbuf_bsd = NULL;
+
+    return 0;
+}
+
+void *ff_mbuf_mtod(void *bsd_mbuf)
+{
+    if (!bsd_mbuf)
+        return NULL;
+    return (void *)((struct mbuf *)bsd_mbuf)->m_data;
+}
+
+// Recover the source rte_mbuf from an mbuf whose external cluster was
+// attached while receiving the packet (e.g. a forwarded packet such as arp).
+void *ff_rte_frm_extcl(void *mbuf)
+{
+    struct mbuf *bsd_mbuf = mbuf;
+
+    if (bsd_mbuf->m_ext.ext_type == EXT_DISPOSABLE &&
+        bsd_mbuf->m_ext.ext_free == ff_mbuf_ext_free) {
+        return bsd_mbuf->m_ext.ext_arg1;
+    }
+
+    return NULL;
+}
+
diff --git a/lib/ff_veth.h b/lib/ff_veth.h
index 7cbeecc3c..ff5a728af 100644
--- a/lib/ff_veth.h
+++ b/lib/ff_veth.h
@@ -37,6 +37,9 @@ void *ff_mbuf_get(void *m, void *data, uint16_t len);
 void ff_mbuf_free(void *m);
 
 int ff_mbuf_copydata(void *m, void *data, int off, int len);
+int ff_next_mbuf(void **mbuf_bsd, void **data, unsigned *len);
+void *ff_mbuf_mtod(void *bsd_mbuf);
+void *ff_rte_frm_extcl(void *mbuf);
 
 struct ff_tx_offload;
 void ff_mbuf_tx_offload(void *m, struct ff_tx_offload *offload);
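--
Usage sketch (illustrative; memsz_MB is the key parsed above, the section
name and neighboring key are assumed from f-stack's default config.ini):
with _USE_PAGE_ARRAY_ compiled in, the size of the pre-mapped bsd page
region that ff_mmap_init() maps and mlocks is configured as:

    [freebsd.boot]
    hz=100
    # size of the pre-mapped bsd page region, in MB (default 256)
    memsz_MB=256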