Do not use mcopy when transmitting from BSD to DPDK

This commit is contained in:
10077240 2019-03-29 10:42:02 +08:00
parent 31b46e5735
commit d817ab20e7
7 changed files with 543 additions and 0 deletions

View File

@ -50,3 +50,6 @@ ff_getsockopt_freebsd
ff_setsockopt_freebsd
ff_dup
ff_dup2
ff_next_mbuf
ff_mbuf_mtod
ff_rte_frm_extcl

View File

@ -506,6 +506,8 @@ ini_parse_handler(void* user, const char* section, const char* name,
pconfig->freebsd.physmem = atol(value);
} else if (strcmp(name, "fd_reserve") == 0) {
pconfig->freebsd.fd_reserve = atoi(value);
} else if (strcmp(name, "memsz_MB") == 0) {
pconfig->freebsd.mem_size = atoi(value);
} else {
return freebsd_conf_handler(pconfig, "boot", name, value);
}
@ -713,6 +715,7 @@ ff_default_config(struct ff_config *cfg)
cfg->freebsd.hz = 100;
cfg->freebsd.physmem = 1048576*256;
cfg->freebsd.fd_reserve = 0;
cfg->freebsd.mem_size =256;
}
int

View File

@ -138,6 +138,7 @@ struct ff_config {
long physmem;
int hz;
int fd_reserve;
int mem_size;
} freebsd;
};

View File

@ -25,6 +25,8 @@
*/
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include <rte_common.h>
#include <rte_byteorder.h>
@ -66,6 +68,12 @@
#define MSG_RING_SIZE 32
#define PAGE_SIZE 4096
#define PAGE_SHIFT 12
#define PAGE_MASK (PAGE_SIZE - 1)
#define trunc_page(x) ((x) & ~PAGE_MASK)
#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK)
/*
* Configurable number of RX/TX ring descriptors
*/
@ -141,6 +149,10 @@ static struct rte_eth_conf default_port_conf = {
/*
 * Per-port staging table of packets waiting for the next TX burst.
 * Slots [0, len) are in use.
 */
struct mbuf_table {
    uint16_t len;                               /* number of queued packets */
    struct rte_mbuf *m_table[MAX_PKT_BURST];    /* DPDK mbufs handed to the NIC */
#ifdef _USE_PAGE_ARRAY_
    /*
     * BSD mbuf stored at the same index as its m_table[] entry; it is
     * kept so that it can be freed later.
     */
    void *bsd_m_table[MAX_PKT_BURST];
#endif
};
struct lcore_rx_queue {
@ -194,6 +206,50 @@ static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
#ifdef _USE_PAGE_ARRAY_
// ff_ref_pool allocates rte_mbufs without any data room; their data pointers refer to the BSD mbuf's data address.
static struct rte_mempool *ff_ref_pool[NB_SOCKETS];
/*
 * mbuf_txring records the BSD mbufs that have been burst into the NIC.
 * m_table has TX_QUEUE_SIZE entries, intended to match the length of the
 * NIC device's sw_ring, so by the time slot m_table[x] is reused the
 * packet previously stored there has been transmitted by the NIC and its
 * mbuf can be freed safely.
 */
struct mbuf_txring {
    void *m_table[TX_QUEUE_SIZE];   /* BSD mbufs awaiting release */
    uint16_t head;                  /* next available element */
};
/*
 * Advance ring index h by one slot, wrapping to 0 once it reaches
 * TX_QUEUE_SIZE.  h must be a modifiable lvalue without side effects
 * (it is evaluated more than once).  Wrapped in do/while(0) so that
 * "Head_INC(x);" is a single statement and is safe inside if/else.
 */
#define Head_INC(h) do {\
    if (++(h) >= TX_QUEUE_SIZE) \
        (h) = 0;\
} while (0)
/*
 * Step ring index h back by one slot, wrapping from 0 to
 * TX_QUEUE_SIZE-1.  The index is unsigned, so the wrap is detected
 * before decrementing: "--h < 0" can never be true for an unsigned
 * index.  h must be a modifiable lvalue without side effects (it is
 * evaluated more than once).
 */
#define Head_DEC(h) do {\
    if ((h) == 0) \
        (h) = TX_QUEUE_SIZE - 1;\
    else \
        --(h);\
} while (0)
// A BSD mbuf is moved from tmp_tables into nic_tx_ring after rte_eth_tx_burst() succeeds.
static struct mbuf_txring nic_tx_ring[RTE_MAX_ETHPORTS];
static inline int ff_txring_enqueue(struct mbuf_txring* q, void *p, int seg_num);
static inline void ff_txring_init(struct mbuf_txring* r, uint32_t len);
static int ff_dpdk_if_send_ex(struct ff_dpdk_if_context *ctx, void *m, int total);
static int ff_mmap_init();
/*
 * Fixed-capacity LIFO stack of 64-bit values.
 * Slots ele[0 .. top-1] are occupied; the stack is full when top == size.
 */
typedef struct _list_manager_s {
    uint64_t *ele;  /* backing array holding `size` slots */
    int size;       /* capacity, in elements */
    int top;        /* number of stored elements == index of next free slot */
} StackList_t;
static StackList_t ff_mpage_ctl = {0};
/*
 * Bounds of the page region, stored as integer addresses.  They are
 * integers, so they start at 0 rather than the pointer constant NULL.
 */
static uint64_t ff_page_start = 0, ff_page_end = 0;
static phys_addr_t* ff_mpage_phy = NULL;
static inline void* StkList_pop(StackList_t *p);
static inline int StkList_push(StackList_t * p, uint64_t val);
#endif
extern void ff_hardclock(void);
static void
@ -409,6 +465,23 @@ init_mem_pool(void)
} else {
printf("create mbuf pool on socket %d\n", socketid);
}
#ifdef _USE_PAGE_ARRAY_
if (ff_ref_pool[socketid] != NULL) {
continue;
}
nb_mbuf = RTE_MAX (
nb_ports*nb_lcores*MAX_PKT_BURST +
nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
nb_lcores*MEMPOOL_CACHE_SIZE,
(unsigned)4096);
snprintf(s, sizeof(s), "ff_ref_pool_%d", socketid);
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
ff_ref_pool[socketid] = rte_pktmbuf_pool_create(s, nb_mbuf, MEMPOOL_CACHE_SIZE, 0, 0, socketid);
} else {
ff_ref_pool[socketid] = rte_mempool_lookup(s);
}
#endif
}
return 0;
@ -844,6 +917,11 @@ ff_dpdk_init(int argc, char **argv)
}
#endif
#ifdef _USE_PAGE_ARRAY_
ff_mmap_init();
#endif
ret = init_port_start();
if (ret < 0) {
rte_exit(EXIT_FAILURE, "init_port_start failed\n");
@ -1283,10 +1361,18 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
uint16_t i;
for (i = 0; i < ret; i++) {
ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef _USE_PAGE_ARRAY_
if (qconf->tx_mbufs[port].bsd_m_table[i])
ff_txring_enqueue(&nic_tx_ring[port], qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
}
if (unlikely(ret < n)) {
do {
rte_pktmbuf_free(m_table[ret]);
#ifdef _USE_PAGE_ARRAY_
if ( qconf->tx_mbufs[port].bsd_m_table[ret] )
ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
} while (++ret < n);
}
return 0;
@ -1318,6 +1404,9 @@ int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
int total)
{
#ifdef _USE_PAGE_ARRAY_
return ff_dpdk_if_send_ex(ctx, m,total);
#endif
struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
if (head == NULL) {
@ -1669,3 +1758,390 @@ ff_get_tsc_ns()
return ((double)cur_tsc/(double)hz) * NS_PER_S;
}
#ifdef _USE_PAGE_ARRAY_
/*
 * Initialise *p as an empty stack with room for `size` 64-bit values.
 *
 * Returns 0 on success, -1 if p is NULL or size is not positive, and -2
 * if the backing array cannot be allocated.  On allocation failure the
 * stack is left empty with zero capacity, so later push/pop calls fail
 * cleanly instead of touching an indeterminate pointer.
 */
static int StkList_init(StackList_t *p, int size)
{
    void *slots = NULL;

    if (p == NULL || size <= 0) {
        return -1;
    }

    /* Allocate through a real void* instead of type-punning &p->ele. */
    if (posix_memalign(&slots, sizeof(uint64_t),
                       sizeof(uint64_t) * (size_t)size) != 0) {
        p->ele = NULL;
        p->size = 0;
        p->top = 0;
        return -2;
    }

    p->ele = slots;
    p->size = size;
    p->top = 0;
    return 0;
}
/*
 * Remove and return the most recently pushed value, converted to a
 * pointer.  Returns NULL when p is NULL or the stack is empty.
 */
static inline void* StkList_pop(StackList_t *p)
{
    if (p == NULL || p->top <= 0) {
        return NULL;
    }

    p->top -= 1;
    return (void*)p->ele[p->top];
}
/*
 * Push val onto the stack.
 * val: the value to store.
 * Return code: 0 on success; -1 when p is NULL or the stack is full.
 */
static inline int StkList_push(StackList_t *p, const uint64_t val)
{
    if (p == NULL || p->top >= p->size) {
        return -1;
    }

    p->ele[p->top] = val;
    p->top += 1;
    return 0;
}
/* Return the capacity of the stack (not the current element count). p must not be NULL. */
static int StkList_Size(StackList_t * p)
{
    int capacity = p->size;

    return capacity;
}
/*
 * Store a 64-bit value in the private data area of an rte_mbuf, i.e.
 * the bytes directly following the struct rte_mbuf header (p + 1).
 *
 * Returns 0 when the value was stored, -1 when the private area of the
 * mbuf's pool is smaller than 8 bytes and nothing was written.  Reporting
 * the failure lets a caller notice that the value is not actually there.
 */
static inline int ff_mbuf_set_uint64(struct rte_mbuf* p, uint64_t data)
{
    if (rte_pktmbuf_priv_size(p->pool) < sizeof(uint64_t)) {
        return -1;
    }

    *((uint64_t*)(p+1)) = data;
    return 0;
}
/*************************
 * Record BSD mbuf p in the tx ring, consuming seg_num slots.
 * The intent (per the original note) is to mirror the NIC's sw_ring,
 * which uses one descriptor per segment: the first seg_num-1 slots are
 * cleared and the last one holds p.
 *
 *      <--- seg_num-1 --->|ptr| head |
 *   ----------------------------------------------
 *   | 0 | 0 | ..........| 0 | p | XXX |
 *   ----------------------------------------------
 *
 * Whatever mbuf was still stored in a slot being reused is freed first.
 * Always returns 0.
 *************************/
static inline int ff_txring_enqueue(struct mbuf_txring* q, void *p, int seg_num)
{
    int pad;

    /* Leading placeholder slots: release the old occupant and clear. */
    for (pad = seg_num - 1; pad > 0; pad--) {
        void *stale = q->m_table[q->head];

        if (stale != NULL) {
            ff_mbuf_free(stale);
            q->m_table[q->head] = NULL;
        }
        Head_INC(q->head);
    }

    /* Final slot carries the mbuf itself. */
    if (q->m_table[q->head] != NULL) {
        ff_mbuf_free(q->m_table[q->head]);
    }
    q->m_table[q->head] = p;
    Head_INC(q->head);

    return 0;
}
/*
 * Undo the most recent enqueue by walking back `num` slots from head
 * (each step uses Head_DEC).
 *
 * The first slot visited must hold an mbuf, which is freed and cleared;
 * every further slot must already be empty.  Any other layout means the
 * ring is corrupted and triggers rte_panic().
 *
 * Returns 0 (the function is declared int, so it must return a value).
 */
static inline int ff_txring_pop(struct mbuf_txring* q, int num)
{
    int i = 0;

    for (i = 0; i < num; i++) {
        Head_DEC(q->head);

        if ((i == 0 && q->m_table[q->head] == NULL) ||
            (i > 0 && q->m_table[q->head] != NULL)) {
            rte_panic("ff_txring_pop fatal error!");
        }

        if (q->m_table[q->head] != NULL) {
            ff_mbuf_free(q->m_table[q->head]);
            q->m_table[q->head] = NULL;
        }
    }

    return 0;
}
/* Zero `num` consecutive mbuf_txring structures starting at q. */
static inline void ff_txring_init(struct mbuf_txring* q, uint32_t num)
{
    size_t bytes = sizeof(struct mbuf_txring) * num;

    memset(q, 0, bytes);
}
static int ff_mmap_init()
{
int err = 0;
int i = 0;
uint64_t virt_addr = NULL;
phys_addr_t phys_addr = 0;
uint64_t bsd_memsz = (ff_global_cfg.freebsd.mem_size << 20);
unsigned int bsd_pagesz = 0;
ff_page_start = (uint64_t)mmap( NULL, bsd_memsz, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
if (ff_page_start == (uint64_t)-1)
{
rte_panic("ff_mmap_init get ff_page_start failed, err=%d.\n", errno);
return -1;
}
if ( mlock((void*)ff_page_start, bsd_memsz)<0 )
{
rte_panic("mlock failed, err=%d.\n", errno);
return -1;
}
ff_page_end = ff_page_start + bsd_memsz;
rte_log(RTE_LOG_INFO, RTE_LOGTYPE_USER1, "ff_mmap_init mmap %d pages, %d MB.\n", bsd_pagesz, ff_global_cfg.freebsd.mem_size);
printf("ff_mmap_init mem[0x%lx:0x%lx]\n", ff_page_start, ff_page_end);
bsd_pagesz = (bsd_memsz>>12);
if(posix_memalign((void**)&ff_mpage_phy, sizeof(phys_addr_t), bsd_pagesz*sizeof(phys_addr_t))!=0)
{
rte_panic("posix_memalign get ff_mpage_phy failed, err=%d.\n", errno);
return -1;
}
StkList_init(&ff_mpage_ctl, bsd_pagesz);
for (i=0; i<bsd_pagesz; i++ )
{
virt_addr = ff_page_start + PAGE_SIZE*i;
memset((void*)virt_addr, 0, PAGE_SIZE);
StkList_push( &ff_mpage_ctl, virt_addr);
ff_mpage_phy[i] = rte_mem_virt2phy((const void*)virt_addr);
if ( ff_mpage_phy[i] == RTE_BAD_IOVA )
{
rte_panic("rte_mem_virt2phy return invalid address.");
return -1;
}
}
ff_txring_init(&nic_tx_ring[0], RTE_MAX_ETHPORTS);
return 0;
}
/*
 * Tell whether virtaddr lies inside the region [ff_page_start, ff_page_end).
 * Returns 1 when it does, 0 otherwise (e.g. memory from a DPDK pool).
 * The lower bound is inclusive: the first byte of the first page is a
 * valid address inside the region.
 */
static inline int ff_chk_vma(const uint64_t virtaddr)
{
    return !!( virtaddr >= ff_page_start && virtaddr < ff_page_end );
}
/*
 * Translate a virtual address inside the ff page region to its physical
 * address, using the per-page table ff_mpage_phy.
 *
 * The page index is kept in 64 bits until after the bounds check; if it
 * were narrowed to 32 bits first, an address far outside the region
 * could alias a valid index.  An out-of-range address ends in rte_panic().
 */
static inline uint64_t ff_mem_virt2phy(const void* virtaddr)
{
    uint64_t page_idx = (((uint64_t)virtaddr - (uint64_t)ff_page_start)>>PAGE_SHIFT);

    if (page_idx >= (uint64_t)StkList_Size(&ff_mpage_ctl))
    {
        rte_panic("ff_mem_virt2phy get invalid pages %llu.", (unsigned long long)page_idx);
        return -1;
    }

    /* Physical page base plus the offset inside the page. */
    return ff_mpage_phy[page_idx] + ((uint64_t)virtaddr & PAGE_MASK);
}
/* Take one page address from the ff_mpage_ctl stack; NULL when none is left. */
void* ff_mem_get_page()
{
    void *page = StkList_pop(&ff_mpage_ctl);

    return page;
}
/*
 * Put address p back on the ff_mpage_ctl stack.
 * Always returns 0; the result of the push is not reported.
 */
int ff_mem_free_addr(void* p)
{
    const uint64_t page_addr = (const uint64_t)p;

    (void)StkList_push(&ff_mpage_ctl, page_addr);
    return 0;
}
static inline void ff_offload_set(struct ff_dpdk_if_context *ctx, void* m, struct rte_mbuf *head)
{
void* data = NULL;
struct ff_tx_offload offload = {0};
ff_mbuf_tx_offload(m, &offload);
data = rte_pktmbuf_mtod(head, void*);
if (offload.ip_csum) {
/* ipv6 not supported yet */
struct ipv4_hdr *iph;
int iph_len;
iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
iph_len = (iph->version_ihl & 0x0f) << 2;
head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
head->l2_len = ETHER_HDR_LEN;
head->l3_len = iph_len;
}
if (ctx->hw_features.tx_csum_l4) {
struct ipv4_hdr *iph;
int iph_len;
iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
iph_len = (iph->version_ihl & 0x0f) << 2;
if (offload.tcp_csum) {
head->ol_flags |= PKT_TX_TCP_CKSUM;
head->l2_len = ETHER_HDR_LEN;
head->l3_len = iph_len;
}
/*
* TCP segmentation offload.
*
* - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
* implies PKT_TX_TCP_CKSUM)
* - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
* - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
* write the IP checksum to 0 in the packet
* - fill the mbuf offload information: l2_len,
* l3_len, l4_len, tso_segsz
* - calculate the pseudo header checksum without taking ip_len
* in account, and set it in the TCP header. Refer to
* rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
* used as helpers.
*/
if (offload.tso_seg_size) {
struct tcp_hdr *tcph;
int tcph_len;
tcph = (struct tcp_hdr *)((char *)iph + iph_len);
tcph_len = (tcph->data_off & 0xf0) >> 2;
tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
head->ol_flags |= PKT_TX_TCP_SEG;
head->l4_len = tcph_len;
head->tso_segsz = offload.tso_seg_size;
}
if (offload.udp_csum) {
head->ol_flags |= PKT_TX_UDP_CKSUM;
head->l2_len = ETHER_HDR_LEN;
head->l3_len = iph_len;
}
}
}
// create rte_buf refer to data which is transmit from bsd stack by EXT_CLUSTER.
static inline struct rte_mbuf* ff_extcl_to_rte(void* m)
{
struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
struct rte_mbuf* src_mbuf = NULL;
struct rte_mbuf *p_head = NULL;
src_mbuf = (struct rte_mbuf*)ff_rte_frm_extcl(m);
if ( NULL==src_mbuf )
{
return NULL;
}
p_head = rte_pktmbuf_clone(src_mbuf, mbuf_pool);
if (p_head == NULL)
{
return NULL;
}
return p_head;
}
/*
 * Build an rte_mbuf chain that refers to the data of BSD mbuf chain m
 * without copying it.
 *
 * One rte_mbuf is taken from ff_ref_pool for every BSD mbuf in the chain;
 * its buffer address and physical address are pointed at the BSD mbuf's
 * data and its data_len is set to that mbuf's length.  The head gets
 * pkt_len = total and nb_segs = number of segments.
 *
 * Returns the head of the chain, or NULL when an rte_mbuf cannot be
 * allocated (segments allocated so far are released).
 */
static inline struct rte_mbuf* ff_bsd_to_rte(void* m, int total)
{
    struct rte_mempool *pool = ff_ref_pool[lcore_conf.socket_id];
    struct rte_mbuf *first = rte_pktmbuf_alloc(pool);
    struct rte_mbuf *last = NULL;
    void *bsd_seg = m;
    void *payload = NULL;
    unsigned payload_len = 0;

    if (first == NULL) {
        return NULL;
    }

    first->pkt_len = total;
    first->nb_segs = 0;

    while (bsd_seg != NULL) {
        struct rte_mbuf *seg = first;

        /* The head serves the first BSD mbuf; later ones need a new segment. */
        if (last != NULL) {
            seg = rte_pktmbuf_alloc(pool);
            if (seg == NULL) {
                rte_pktmbuf_free(first);
                return NULL;
            }
        }

        /* Reads the current mbuf's data/len and advances bsd_seg to the next one. */
        ff_next_mbuf(&bsd_seg, &payload, &payload_len);

        seg->buf_addr = payload;
        seg->buf_physaddr = ff_mem_virt2phy((const void*)(seg->buf_addr));
        seg->data_off = 0;
        seg->data_len = payload_len;
        first->nb_segs++;

        if (last != NULL) {
            last->next = seg;
        }
        last = seg;
    }

    return first;
}
/*
 * Queue BSD mbuf chain m for transmission on ctx's port without copying
 * its payload.
 *
 * - If the mbuf data lies in the ff page region, an rte_mbuf chain
 *   referring to that data is built with ff_bsd_to_rte().
 * - Otherwise the mbuf must carry an rte_mbuf in an external cluster,
 *   which is cloned by ff_extcl_to_rte(); if that fails the process
 *   panics.
 *
 * The resulting rte_mbuf and the BSD mbuf are stored at the same index
 * of the per-port tx table; a burst is sent once MAX_PKT_BURST packets
 * are queued.
 *
 * Returns 0 on success, -1 when m is NULL or no rte_mbuf could be built
 * (in the latter case m is freed).
 */
int ff_dpdk_if_send_ex(struct ff_dpdk_if_context *ctx, void *m, int total)
{
    struct rte_mbuf *head = NULL;
    void* p_data = NULL;
    struct lcore_conf *qconf = NULL;
    unsigned len = 0;

    if ( !m )
    {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_dpdk_if_send_ex input invalid NULL address.");
        return -1;
    }

    p_data = ff_mbuf_mtod(m);
    if ( ff_chk_vma((uint64_t)p_data))
    {
        head = ff_bsd_to_rte(m, total);
    }
    else
    {
        /* Keep the clone: dropping it would leak it and fail every such packet. */
        head = ff_extcl_to_rte(m);
        if ( head == NULL )
        {
            rte_panic("data address 0x%lx is out of page bound or malloc by DPDK recver.", (uint64_t)p_data);
            return -1;
        }
    }

    if (head == NULL)
    {
        rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_USER1, "ff_dpdk_if_send_ex call ff_bsd_to_rte failed.");
        ff_mbuf_free(m);
        return -1;
    }

    ff_offload_set(ctx, m, head);

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[ctx->port_id].len;
    qconf->tx_mbufs[ctx->port_id].m_table[len] = head;
    /* Remember the BSD mbuf next to its rte_mbuf so that it can be released later. */
    qconf->tx_mbufs[ctx->port_id].bsd_m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;

    return 0;
}
#endif

View File

@ -46,6 +46,8 @@
#include "ff_errno.h"
static struct timespec current_ts;
extern void* ff_mem_get_page();
extern int ff_mem_free_addr(void* p);
void *
ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset)
@ -54,6 +56,15 @@ ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset)
int host_prot;
int host_flags;
#ifdef _USE_PAGE_ARRAY_
if( len == 4096 )
{
return ff_mem_get_page();
}
else
#endif
{
assert(ff_PROT_NONE == PROT_NONE);
host_prot = 0;
if ((prot & ff_PROT_READ) == ff_PROT_READ) host_prot |= PROT_READ;
@ -71,11 +82,18 @@ ff_mmap(void *addr, uint64_t len, int prot, int flags, int fd, uint64_t offset)
exit(1);
}
return ret;
}
}
int
ff_munmap(void *addr, uint64_t len)
{
#ifdef _USE_PAGE_ARRAY_
if ( len == 4096 )
{
return ff_mem_free_addr(addr);
}
#endif
//rte_free(addr);
//return 0;
return (munmap(addr, len));

View File

@ -420,3 +420,42 @@ ff_veth_softc_to_hostc(void *softc)
return (void *)sc->host_ctx;
}
/********************
 * Report the current mbuf's data pointer and data length, then advance
 * *mbuf_bsd to the next mbuf in the chain (NULL at the end of the chain).
 * *mbuf_bsd must point to a valid mbuf on entry.  Always returns 0.
 ********************/
int ff_next_mbuf(void **mbuf_bsd, void **data, unsigned *len)
{
    struct mbuf *cur = (struct mbuf *)*mbuf_bsd;

    *len = cur->m_len;
    *data = cur->m_data;
    /* m_next is NULL for the last mbuf, which also terminates the walk. */
    *mbuf_bsd = cur->m_next;

    return 0;
}
/* Return the data pointer (m_data) of a BSD mbuf, or NULL when bsd_mbuf is NULL. */
void * ff_mbuf_mtod(void* bsd_mbuf)
{
    struct mbuf *mb = (struct mbuf *)bsd_mbuf;

    if (mb == NULL) {
        return NULL;
    }
    return (void*)mb->m_data;
}
/*
 * Get the source rte_mbuf from an external cluster that carries one; such
 * clusters are attached when receiving packets (e.g. ARP).
 *
 * Returns the cluster's ext_arg1 when the external storage is of type
 * EXT_DISPOSABLE and is released through ff_mbuf_ext_free; returns NULL
 * otherwise, including when mbuf itself is NULL (same convention as
 * ff_mbuf_mtod).
 *
 * NOTE(review): m_ext is read without first checking that the mbuf
 * actually has external storage attached - confirm whether a flag check
 * is needed before trusting these fields.
 */
void* ff_rte_frm_extcl(void* mbuf)
{
    struct mbuf* bsd_mbuf = mbuf;

    if ( bsd_mbuf == NULL )
        return NULL;

    if ( bsd_mbuf->m_ext.ext_type==EXT_DISPOSABLE && bsd_mbuf->m_ext.ext_free==ff_mbuf_ext_free )
    {
        return bsd_mbuf->m_ext.ext_arg1;
    }

    return NULL;
}

View File

@ -37,6 +37,9 @@ void *ff_mbuf_get(void *m, void *data, uint16_t len);
void ff_mbuf_free(void *m);
int ff_mbuf_copydata(void *m, void *data, int off, int len);
int ff_next_mbuf(void **mbuf_bsd, void **data, unsigned *len);
void* ff_mbuf_mtod(void* bsd_mbuf);
void* ff_rte_frm_extcl(void* mbuf);
struct ff_tx_offload;
void ff_mbuf_tx_offload(void *m, struct ff_tx_offload *offload);