mirror of https://github.com/F-Stack/f-stack.git
510 lines
13 KiB
C
510 lines
13 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
|
|
* Copyright(c) 2023 Intel Corporation
|
|
*/
|
|
|
|
#include <inttypes.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
|
|
#include <rte_time.h>
|
|
#include <rte_mbuf.h>
|
|
#include <rte_dmadev.h>
|
|
#include <rte_malloc.h>
|
|
#include <rte_lcore.h>
|
|
|
|
#include "main.h"
|
|
|
|
/* Maximum number of completions requested from one rte_dma_completed() call. */
#define MAX_DMA_CPL_NB 255
|
|
|
|
/*
 * Delay, in microseconds (passed to usleep()), between releasing the workers
 * and sampling their baseline completion counters.
 */
#define TEST_WAIT_U_SECOND 10000
|
|
/* Upper bound on polling iterations when draining in-flight DMA copies at the end of a run. */
#define POLL_MAX 1000
|
|
|
|
/* CSV row for one DMA worker; formatted by output_result(). */
#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
|
|
#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,NA,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
|
|
|
|
/* CSV summary row aggregating all workers of a scenario; formatted by mem_copy_benchmark(). */
#define CSV_TOTAL_LINE_FMT "Scenario %u Summary, , , , , ,%u,%.2lf,%u,%.3lf,%.3lf\n"
|
|
|
|
/* Handshake flags and completion counters shared between the main lcore and one worker. */
struct worker_info {
	bool ready_flag;	/* set by the worker once it has reached its start barrier */
	bool start_flag;	/* set by the main lcore to release the worker */
	bool stop_flag;		/* set by the main lcore to ask the worker to leave its copy loop */
	uint32_t total_cpl;	/* running count of completed copies, updated by the worker */
	uint32_t test_cpl;	/* copies completed inside the measurement window, computed by the main lcore */
};
|
|
|
|
struct lcore_params {
|
|
uint8_t scenario_id;
|
|
unsigned int lcore_id;
|
|
char *dma_name;
|
|
uint16_t worker_id;
|
|
uint16_t dev_id;
|
|
uint32_t nr_buf;
|
|
uint16_t kick_batch;
|
|
uint32_t buf_size;
|
|
uint16_t test_secs;
|
|
struct rte_mbuf **srcs;
|
|
struct rte_mbuf **dsts;
|
|
volatile struct worker_info worker_info;
|
|
};
|
|
|
|
/* mbuf pool backing the source buffers; created in setup_memory_env(), freed in mem_copy_benchmark(). */
static struct rte_mempool *src_pool;
|
|
/* mbuf pool backing the destination buffers; created in setup_memory_env(), freed in mem_copy_benchmark(). */
static struct rte_mempool *dst_pool;
|
|
|
|
/* Per-worker parameter blocks, indexed by worker index; allocated and released by mem_copy_benchmark(). */
static struct lcore_params *lcores[MAX_WORKER_NB];
|
|
|
|
/* Report an error on stderr, prefixed with the calling function's name and line number. */
#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)
|
|
|
|
static inline int
|
|
__rte_format_printf(3, 4)
|
|
print_err(const char *func, int lineno, const char *format, ...)
|
|
{
|
|
va_list ap;
|
|
int ret;
|
|
|
|
ret = fprintf(stderr, "In %s:%d - ", func, lineno);
|
|
va_start(ap, format);
|
|
ret += vfprintf(stderr, format, ap);
|
|
va_end(ap);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Derive the per-worker figures reported for one test run.
 *
 * @param buf_size    bytes copied per operation
 * @param nr_buf      total number of buffer pairs across all workers
 * @param nb_workers  number of workers sharing nr_buf
 * @param test_secs   measurement window in seconds
 * @param total_cnt   copies completed by the worker within the window
 * @param memory      out: MiB covered by the worker's source + destination buffers
 * @param ave_cycle   out: average timer cycles per copy
 * @param bandwidth   out: throughput in Gbps
 * @param mops        out: millions of copies per second
 *
 * A zero worker count, completion count or test duration yields 0 for the
 * dependent outputs instead of dividing by zero.
 */
static inline void
calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
			uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
			float *bandwidth, float *mops)
{
	uint64_t worker_bytes = 0;
	float ops = 0;

	/* Widen before multiplying: the byte count can exceed 32 bits. */
	if (nb_workers != 0)
		worker_bytes = (uint64_t)buf_size * (nr_buf / nb_workers) * 2;
	*memory = (float)worker_bytes / (1024 * 1024);

	/* A worker that completed nothing has no meaningful cycles-per-op. */
	if (total_cnt != 0)
		*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
	else
		*ave_cycle = 0;

	if (test_secs != 0)
		ops = (float)total_cnt / test_secs;

	*mops = ops / (1000 * 1000);
	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000);
}
|
|
|
|
static void
|
|
output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
|
|
uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
|
|
float memory, float bandwidth, float mops, bool is_dma)
|
|
{
|
|
if (is_dma)
|
|
printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
|
|
lcore_id, dma_name, ring_size, kick_batch);
|
|
else
|
|
printf("lcore %u\n", lcore_id);
|
|
|
|
printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf Ghz.\n",
|
|
ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz()/1000000000.0);
|
|
printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);
|
|
|
|
if (is_dma)
|
|
snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
|
|
scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
|
|
nr_buf, memory, ave_cycle, bandwidth, mops);
|
|
else
|
|
snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
|
|
scenario_id, lcore_id, buf_size,
|
|
nr_buf, memory, ave_cycle, bandwidth, mops);
|
|
}
|
|
|
|
static inline void
|
|
cache_flush_buf(__rte_unused struct rte_mbuf **array,
|
|
__rte_unused uint32_t buf_size,
|
|
__rte_unused uint32_t nr_buf)
|
|
{
|
|
#ifdef RTE_ARCH_X86_64
|
|
char *data;
|
|
struct rte_mbuf **srcs = array;
|
|
uint32_t i, offset;
|
|
|
|
for (i = 0; i < nr_buf; i++) {
|
|
data = rte_pktmbuf_mtod(srcs[i], char *);
|
|
for (offset = 0; offset < buf_size; offset += 64)
|
|
__builtin_ia32_clflush(data + offset);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/* Configuration of device. */
|
|
static void
|
|
configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
|
|
{
|
|
uint16_t vchan = 0;
|
|
struct rte_dma_info info;
|
|
struct rte_dma_conf dev_config = { .nb_vchans = 1 };
|
|
struct rte_dma_vchan_conf qconf = {
|
|
.direction = RTE_DMA_DIR_MEM_TO_MEM,
|
|
.nb_desc = ring_size
|
|
};
|
|
|
|
if (rte_dma_configure(dev_id, &dev_config) != 0)
|
|
rte_exit(EXIT_FAILURE, "Error with dma configure.\n");
|
|
|
|
if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
|
|
rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");
|
|
|
|
if (rte_dma_info_get(dev_id, &info) != 0)
|
|
rte_exit(EXIT_FAILURE, "Error with getting device info.\n");
|
|
|
|
if (info.nb_vchans != 1)
|
|
rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id. %u\n",
|
|
dev_id);
|
|
|
|
if (rte_dma_start(dev_id) != 0)
|
|
rte_exit(EXIT_FAILURE, "Error with dma start.\n");
|
|
}
|
|
|
|
static int
|
|
config_dmadevs(struct test_configure *cfg)
|
|
{
|
|
uint32_t ring_size = cfg->ring_size.cur;
|
|
struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
|
|
uint32_t nb_workers = ldm->cnt;
|
|
uint32_t i;
|
|
int dev_id;
|
|
uint16_t nb_dmadevs = 0;
|
|
char *dma_name;
|
|
|
|
for (i = 0; i < ldm->cnt; i++) {
|
|
dma_name = ldm->dma_names[i];
|
|
dev_id = rte_dma_get_dev_id_by_name(dma_name);
|
|
if (dev_id < 0) {
|
|
fprintf(stderr, "Error: Fail to find DMA %s.\n", dma_name);
|
|
goto end;
|
|
}
|
|
|
|
ldm->dma_ids[i] = dev_id;
|
|
configure_dmadev_queue(dev_id, ring_size);
|
|
++nb_dmadevs;
|
|
}
|
|
|
|
end:
|
|
if (nb_dmadevs < nb_workers) {
|
|
printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
|
|
return -1;
|
|
}
|
|
|
|
printf("Number of used dmadevs: %u.\n", nb_dmadevs);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Stop and close DMA device dev_id, then terminate the application with
 * EXIT_FAILURE via rte_exit(). The results of stop/close are not checked.
 */
static void
error_exit(int dev_id)
{
	(void)rte_dma_stop(dev_id);
	(void)rte_dma_close(dev_id);

	rte_exit(EXIT_FAILURE, "DMA error\n");
}
|
|
|
|
static inline void
|
|
do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
|
|
volatile struct worker_info *worker_info)
|
|
{
|
|
int ret;
|
|
uint16_t nr_cpl;
|
|
|
|
ret = rte_dma_submit(dev_id, 0);
|
|
if (ret < 0)
|
|
error_exit(dev_id);
|
|
|
|
nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
|
|
*async_cnt -= nr_cpl;
|
|
worker_info->total_cpl += nr_cpl;
|
|
}
|
|
|
|
static inline int
|
|
do_dma_mem_copy(void *p)
|
|
{
|
|
struct lcore_params *para = (struct lcore_params *)p;
|
|
volatile struct worker_info *worker_info = &(para->worker_info);
|
|
const uint16_t dev_id = para->dev_id;
|
|
const uint32_t nr_buf = para->nr_buf;
|
|
const uint16_t kick_batch = para->kick_batch;
|
|
const uint32_t buf_size = para->buf_size;
|
|
struct rte_mbuf **srcs = para->srcs;
|
|
struct rte_mbuf **dsts = para->dsts;
|
|
uint16_t nr_cpl;
|
|
uint64_t async_cnt = 0;
|
|
uint32_t i;
|
|
uint32_t poll_cnt = 0;
|
|
int ret;
|
|
|
|
worker_info->stop_flag = false;
|
|
worker_info->ready_flag = true;
|
|
|
|
while (!worker_info->start_flag)
|
|
;
|
|
|
|
while (1) {
|
|
for (i = 0; i < nr_buf; i++) {
|
|
dma_copy:
|
|
ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
|
|
rte_mbuf_data_iova(dsts[i]), buf_size, 0);
|
|
if (unlikely(ret < 0)) {
|
|
if (ret == -ENOSPC) {
|
|
do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
|
|
goto dma_copy;
|
|
} else
|
|
error_exit(dev_id);
|
|
}
|
|
async_cnt++;
|
|
|
|
if ((async_cnt % kick_batch) == 0)
|
|
do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
|
|
}
|
|
|
|
if (worker_info->stop_flag)
|
|
break;
|
|
}
|
|
|
|
rte_dma_submit(dev_id, 0);
|
|
while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
|
|
nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
|
|
async_cnt -= nr_cpl;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
do_cpu_mem_copy(void *p)
|
|
{
|
|
struct lcore_params *para = (struct lcore_params *)p;
|
|
volatile struct worker_info *worker_info = &(para->worker_info);
|
|
const uint32_t nr_buf = para->nr_buf;
|
|
const uint32_t buf_size = para->buf_size;
|
|
struct rte_mbuf **srcs = para->srcs;
|
|
struct rte_mbuf **dsts = para->dsts;
|
|
uint32_t i;
|
|
|
|
worker_info->stop_flag = false;
|
|
worker_info->ready_flag = true;
|
|
|
|
while (!worker_info->start_flag)
|
|
;
|
|
|
|
while (1) {
|
|
for (i = 0; i < nr_buf; i++) {
|
|
const void *src = rte_pktmbuf_mtod(dsts[i], void *);
|
|
void *dst = rte_pktmbuf_mtod(srcs[i], void *);
|
|
|
|
/* copy buffer form src to dst */
|
|
rte_memcpy(dst, src, (size_t)buf_size);
|
|
worker_info->total_cpl++;
|
|
}
|
|
if (worker_info->stop_flag)
|
|
break;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
|
|
struct rte_mbuf ***dsts)
|
|
{
|
|
unsigned int buf_size = cfg->buf_size.cur;
|
|
unsigned int nr_sockets;
|
|
uint32_t nr_buf = cfg->nr_buf;
|
|
|
|
nr_sockets = rte_socket_count();
|
|
if (cfg->src_numa_node >= nr_sockets ||
|
|
cfg->dst_numa_node >= nr_sockets) {
|
|
printf("Error: Source or destination numa exceeds the acture numa nodes.\n");
|
|
return -1;
|
|
}
|
|
|
|
src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
|
|
nr_buf,
|
|
0,
|
|
0,
|
|
buf_size + RTE_PKTMBUF_HEADROOM,
|
|
cfg->src_numa_node);
|
|
if (src_pool == NULL) {
|
|
PRINT_ERR("Error with source mempool creation.\n");
|
|
return -1;
|
|
}
|
|
|
|
dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
|
|
nr_buf,
|
|
0,
|
|
0,
|
|
buf_size + RTE_PKTMBUF_HEADROOM,
|
|
cfg->dst_numa_node);
|
|
if (dst_pool == NULL) {
|
|
PRINT_ERR("Error with destination mempool creation.\n");
|
|
return -1;
|
|
}
|
|
|
|
*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
|
|
if (*srcs == NULL) {
|
|
printf("Error: srcs malloc failed.\n");
|
|
return -1;
|
|
}
|
|
|
|
*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
|
|
if (*dsts == NULL) {
|
|
printf("Error: dsts malloc failed.\n");
|
|
return -1;
|
|
}
|
|
|
|
if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
|
|
printf("alloc src mbufs failed.\n");
|
|
return -1;
|
|
}
|
|
|
|
if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
|
|
printf("alloc dst mbufs failed.\n");
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
void
|
|
mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
|
|
{
|
|
uint16_t i;
|
|
uint32_t offset;
|
|
unsigned int lcore_id = 0;
|
|
struct rte_mbuf **srcs = NULL, **dsts = NULL;
|
|
struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
|
|
unsigned int buf_size = cfg->buf_size.cur;
|
|
uint16_t kick_batch = cfg->kick_batch.cur;
|
|
uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
|
|
uint16_t nb_workers = ldm->cnt;
|
|
uint16_t test_secs = cfg->test_secs;
|
|
float memory = 0;
|
|
uint32_t avg_cycles = 0;
|
|
uint32_t avg_cycles_total;
|
|
float mops, mops_total;
|
|
float bandwidth, bandwidth_total;
|
|
|
|
if (setup_memory_env(cfg, &srcs, &dsts) < 0)
|
|
goto out;
|
|
|
|
if (is_dma)
|
|
if (config_dmadevs(cfg) < 0)
|
|
goto out;
|
|
|
|
if (cfg->cache_flush == 1) {
|
|
cache_flush_buf(srcs, buf_size, nr_buf);
|
|
cache_flush_buf(dsts, buf_size, nr_buf);
|
|
rte_mb();
|
|
}
|
|
|
|
printf("Start testing....\n");
|
|
|
|
for (i = 0; i < nb_workers; i++) {
|
|
lcore_id = ldm->lcores[i];
|
|
offset = nr_buf / nb_workers * i;
|
|
lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
|
|
if (lcores[i] == NULL) {
|
|
printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
|
|
break;
|
|
}
|
|
if (is_dma) {
|
|
lcores[i]->dma_name = ldm->dma_names[i];
|
|
lcores[i]->dev_id = ldm->dma_ids[i];
|
|
lcores[i]->kick_batch = kick_batch;
|
|
}
|
|
lcores[i]->worker_id = i;
|
|
lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
|
|
lcores[i]->buf_size = buf_size;
|
|
lcores[i]->test_secs = test_secs;
|
|
lcores[i]->srcs = srcs + offset;
|
|
lcores[i]->dsts = dsts + offset;
|
|
lcores[i]->scenario_id = cfg->scenario_id;
|
|
lcores[i]->lcore_id = lcore_id;
|
|
|
|
if (is_dma)
|
|
rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
|
|
else
|
|
rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
|
|
}
|
|
|
|
while (1) {
|
|
bool ready = true;
|
|
for (i = 0; i < nb_workers; i++) {
|
|
if (lcores[i]->worker_info.ready_flag == false) {
|
|
ready = 0;
|
|
break;
|
|
}
|
|
}
|
|
if (ready)
|
|
break;
|
|
}
|
|
|
|
for (i = 0; i < nb_workers; i++)
|
|
lcores[i]->worker_info.start_flag = true;
|
|
|
|
usleep(TEST_WAIT_U_SECOND);
|
|
for (i = 0; i < nb_workers; i++)
|
|
lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl;
|
|
|
|
usleep(test_secs * 1000 * 1000);
|
|
for (i = 0; i < nb_workers; i++)
|
|
lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl -
|
|
lcores[i]->worker_info.test_cpl;
|
|
|
|
for (i = 0; i < nb_workers; i++)
|
|
lcores[i]->worker_info.stop_flag = true;
|
|
|
|
rte_eal_mp_wait_lcore();
|
|
|
|
mops_total = 0;
|
|
bandwidth_total = 0;
|
|
avg_cycles_total = 0;
|
|
for (i = 0; i < nb_workers; i++) {
|
|
calc_result(buf_size, nr_buf, nb_workers, test_secs,
|
|
lcores[i]->worker_info.test_cpl,
|
|
&memory, &avg_cycles, &bandwidth, &mops);
|
|
output_result(cfg->scenario_id, lcores[i]->lcore_id,
|
|
lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
|
|
avg_cycles, buf_size, nr_buf / nb_workers, memory,
|
|
bandwidth, mops, is_dma);
|
|
mops_total += mops;
|
|
bandwidth_total += bandwidth;
|
|
avg_cycles_total += avg_cycles;
|
|
}
|
|
printf("\nTotal Bandwidth: %.3lf Gbps, Total MOps: %.3lf\n", bandwidth_total, mops_total);
|
|
snprintf(output_str[MAX_WORKER_NB], MAX_OUTPUT_STR_LEN, CSV_TOTAL_LINE_FMT,
|
|
cfg->scenario_id, nr_buf, memory * nb_workers,
|
|
avg_cycles_total / nb_workers, bandwidth_total, mops_total);
|
|
|
|
out:
|
|
/* free mbufs used in the test */
|
|
if (srcs != NULL)
|
|
rte_pktmbuf_free_bulk(srcs, nr_buf);
|
|
if (dsts != NULL)
|
|
rte_pktmbuf_free_bulk(dsts, nr_buf);
|
|
|
|
/* free the points for the mbufs */
|
|
rte_free(srcs);
|
|
srcs = NULL;
|
|
rte_free(dsts);
|
|
dsts = NULL;
|
|
|
|
rte_mempool_free(src_pool);
|
|
src_pool = NULL;
|
|
|
|
rte_mempool_free(dst_pool);
|
|
dst_pool = NULL;
|
|
|
|
/* free the worker parameters */
|
|
for (i = 0; i < nb_workers; i++) {
|
|
rte_free(lcores[i]);
|
|
lcores[i] = NULL;
|
|
}
|
|
|
|
if (is_dma) {
|
|
for (i = 0; i < nb_workers; i++) {
|
|
printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
|
|
rte_dma_stop(ldm->dma_ids[i]);
|
|
}
|
|
}
|
|
}
|