f-stack/dpdk/app/test-dma-perf/benchmark.c

/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2023 Intel Corporation
 */

#include <inttypes.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <rte_time.h>
#include <rte_mbuf.h>
#include <rte_dmadev.h>
#include <rte_malloc.h>
#include <rte_lcore.h>

#include "main.h"
#define MAX_DMA_CPL_NB 255
#define TEST_WAIT_U_SECOND 10000
#define POLL_MAX 1000

#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,NA,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
#define CSV_TOTAL_LINE_FMT "Scenario %u Summary, , , , , ,%u,%.2lf,%u,%.3lf,%.3lf\n"
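
/* Per-worker state shared between the main lcore and each worker lcore. */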
struct worker_info {
	bool ready_flag;
	bool start_flag;
	bool stop_flag;
	uint32_t total_cpl;
	uint32_t test_cpl;
};
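
/* Per-lcore test parameters handed to the copy workers. */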
struct lcore_params {
	uint8_t scenario_id;
	unsigned int lcore_id;
	char *dma_name;
	uint16_t worker_id;
	uint16_t dev_id;
	uint32_t nr_buf;
	uint16_t kick_batch;
	uint32_t buf_size;
	uint16_t test_secs;
	struct rte_mbuf **srcs;
	struct rte_mbuf **dsts;
	volatile struct worker_info worker_info;
};

static struct rte_mempool *src_pool;
static struct rte_mempool *dst_pool;

static struct lcore_params *lcores[MAX_WORKER_NB];

#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)

static inline int
__rte_format_printf(3, 4)
print_err(const char *func, int lineno, const char *format, ...)
{
	va_list ap;
	int ret;

	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
	va_start(ap, format);
	ret += vfprintf(stderr, format, ap);
	va_end(ap);

	return ret;
}
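
/*
 * Convert a worker's completion count into the reported metrics: memory
 * touched (MB), average cycles per operation, bandwidth (Gbps) and
 * millions of operations per second.
 */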
static inline void
calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
		uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
		float *bandwidth, float *mops)
{
	float ops;

	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
	ops = (float)total_cnt / test_secs;
	*mops = ops / (1000 * 1000);
	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000);
}
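
/* Print one worker's results to stdout and record its CSV output line. */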
static void
output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
		uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
		float memory, float bandwidth, float mops, bool is_dma)
{
	if (is_dma)
		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
				lcore_id, dma_name, ring_size, kick_batch);
	else
		printf("lcore %u\n", lcore_id);
	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf GHz.\n",
			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz() / 1000000000.0);
	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);

	if (is_dma)
		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
			nr_buf, memory, ave_cycle, bandwidth, mops);
	else
		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
			scenario_id, lcore_id, buf_size,
			nr_buf, memory, ave_cycle, bandwidth, mops);
}
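
/*
 * Flush the buffers out of the CPU caches so the test measures memory
 * traffic rather than cache hits. Only implemented for x86_64 (clflush);
 * a no-op elsewhere.
 */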
static inline void
cache_flush_buf(__rte_unused struct rte_mbuf **array,
		__rte_unused uint32_t buf_size,
		__rte_unused uint32_t nr_buf)
{
#ifdef RTE_ARCH_X86_64
	char *data;
	struct rte_mbuf **srcs = array;
	uint32_t i, offset;

	for (i = 0; i < nr_buf; i++) {
		data = rte_pktmbuf_mtod(srcs[i], char *);
		for (offset = 0; offset < buf_size; offset += 64)
			__builtin_ia32_clflush(data + offset);
	}
#endif
}

/* Configure and start a DMA device with a single MEM_TO_MEM vchan. */
static void
configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
{
	uint16_t vchan = 0;
	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = ring_size
	};

	if (rte_dma_configure(dev_id, &dev_config) != 0)
		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");

	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");

	if (rte_dma_info_get(dev_id, &info) != 0)
		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");

	if (info.nb_vchans != 1)
		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id %u.\n",
				dev_id);

	if (rte_dma_start(dev_id) != 0)
		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
}
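
/* Look up and configure one dmadev per worker from the lcore/DMA map. */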
static int
config_dmadevs(struct test_configure *cfg)
{
	uint32_t ring_size = cfg->ring_size.cur;
	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
	uint32_t nb_workers = ldm->cnt;
	uint32_t i;
	int dev_id;
	uint16_t nb_dmadevs = 0;
	char *dma_name;

	for (i = 0; i < ldm->cnt; i++) {
		dma_name = ldm->dma_names[i];
		dev_id = rte_dma_get_dev_id_by_name(dma_name);
		if (dev_id < 0) {
			fprintf(stderr, "Error: Failed to find DMA %s.\n", dma_name);
			goto end;
		}

		ldm->dma_ids[i] = dev_id;
		configure_dmadev_queue(dev_id, ring_size);
		++nb_dmadevs;
	}

end:
	if (nb_dmadevs < nb_workers) {
		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
		return -1;
	}

	printf("Number of used dmadevs: %u.\n", nb_dmadevs);
	return 0;
}
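
/* Stop and close the failing device, then abort the whole run. */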
static void
error_exit(int dev_id)
{
	rte_dma_stop(dev_id);
	rte_dma_close(dev_id);
	rte_exit(EXIT_FAILURE, "DMA error\n");
}
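
/*
 * Kick the pending descriptors and reap completions, updating both the
 * in-flight count and the worker's completion counter.
 */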
static inline void
do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
			volatile struct worker_info *worker_info)
{
	int ret;
	uint16_t nr_cpl;

	ret = rte_dma_submit(dev_id, 0);
	if (ret < 0)
		error_exit(dev_id);

	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
	*async_cnt -= nr_cpl;
	worker_info->total_cpl += nr_cpl;
}
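
/*
 * DMA worker loop: enqueue a copy for every buffer, kicking the device
 * every kick_batch descriptors and retrying on -ENOSPC, until the main
 * lcore sets stop_flag. Any in-flight descriptors are drained at the end.
 */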
static inline int
do_dma_mem_copy(void *p)
{
	struct lcore_params *para = (struct lcore_params *)p;
	volatile struct worker_info *worker_info = &(para->worker_info);
	const uint16_t dev_id = para->dev_id;
	const uint32_t nr_buf = para->nr_buf;
	const uint16_t kick_batch = para->kick_batch;
	const uint32_t buf_size = para->buf_size;
	struct rte_mbuf **srcs = para->srcs;
	struct rte_mbuf **dsts = para->dsts;
	uint16_t nr_cpl;
	uint64_t async_cnt = 0;
	uint32_t i;
	uint32_t poll_cnt = 0;
	int ret;

	worker_info->stop_flag = false;
	worker_info->ready_flag = true;

	while (!worker_info->start_flag)
		;

	while (1) {
		for (i = 0; i < nr_buf; i++) {
dma_copy:
			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
			if (unlikely(ret < 0)) {
				if (ret == -ENOSPC) {
					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
					goto dma_copy;
				} else
					error_exit(dev_id);
			}
			async_cnt++;

			if ((async_cnt % kick_batch) == 0)
				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
		}

		if (worker_info->stop_flag)
			break;
	}

	rte_dma_submit(dev_id, 0);
	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
		async_cnt -= nr_cpl;
	}

	return 0;
}
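
/*
 * CPU worker loop: rte_memcpy() each src buffer into its dst buffer
 * until the main lcore sets stop_flag.
 */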
static inline int
do_cpu_mem_copy(void *p)
{
	struct lcore_params *para = (struct lcore_params *)p;
	volatile struct worker_info *worker_info = &(para->worker_info);
	const uint32_t nr_buf = para->nr_buf;
	const uint32_t buf_size = para->buf_size;
	struct rte_mbuf **srcs = para->srcs;
	struct rte_mbuf **dsts = para->dsts;
	uint32_t i;

	worker_info->stop_flag = false;
	worker_info->ready_flag = true;

	while (!worker_info->start_flag)
		;

	while (1) {
		for (i = 0; i < nr_buf; i++) {
			const void *src = rte_pktmbuf_mtod(srcs[i], void *);
			void *dst = rte_pktmbuf_mtod(dsts[i], void *);

			/* copy buffer from src to dst */
			rte_memcpy(dst, src, (size_t)buf_size);
			worker_info->total_cpl++;
		}
		if (worker_info->stop_flag)
			break;
	}

	return 0;
}
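
/*
 * Create the source and destination mempools on the configured NUMA
 * nodes and bulk-allocate nr_buf mbufs from each.
 */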
static int
setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
			struct rte_mbuf ***dsts)
{
	unsigned int buf_size = cfg->buf_size.cur;
	unsigned int nr_sockets;
	uint32_t nr_buf = cfg->nr_buf;

	nr_sockets = rte_socket_count();
	if (cfg->src_numa_node >= nr_sockets ||
		cfg->dst_numa_node >= nr_sockets) {
		printf("Error: Source or destination numa exceeds the actual numa nodes.\n");
		return -1;
	}

	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
			nr_buf,
			0,
			0,
			buf_size + RTE_PKTMBUF_HEADROOM,
			cfg->src_numa_node);
	if (src_pool == NULL) {
		PRINT_ERR("Error with source mempool creation.\n");
		return -1;
	}

	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
			nr_buf,
			0,
			0,
			buf_size + RTE_PKTMBUF_HEADROOM,
			cfg->dst_numa_node);
	if (dst_pool == NULL) {
		PRINT_ERR("Error with destination mempool creation.\n");
		return -1;
	}

	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
	if (*srcs == NULL) {
		printf("Error: srcs malloc failed.\n");
		return -1;
	}

	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
	if (*dsts == NULL) {
		printf("Error: dsts malloc failed.\n");
		return -1;
	}

	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
		printf("alloc src mbufs failed.\n");
		return -1;
	}

	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
		printf("alloc dst mbufs failed.\n");
		return -1;
	}

	return 0;
}
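
/*
 * Run one benchmark scenario: set up buffers (and dmadevs for the DMA
 * case), launch one copy worker per configured lcore, sample completion
 * counters over test_secs seconds, then report per-worker and total
 * results and release all resources.
 */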
void
mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
{
	uint16_t i;
	uint32_t offset;
	unsigned int lcore_id = 0;
	struct rte_mbuf **srcs = NULL, **dsts = NULL;
	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
	unsigned int buf_size = cfg->buf_size.cur;
	uint16_t kick_batch = cfg->kick_batch.cur;
	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
	uint16_t nb_workers = ldm->cnt;
	uint16_t test_secs = cfg->test_secs;
	float memory = 0;
	uint32_t avg_cycles = 0;
	uint32_t avg_cycles_total;
	float mops, mops_total;
	float bandwidth, bandwidth_total;

	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
		goto out;

	if (is_dma)
		if (config_dmadevs(cfg) < 0)
			goto out;

	if (cfg->cache_flush == 1) {
		cache_flush_buf(srcs, buf_size, nr_buf);
		cache_flush_buf(dsts, buf_size, nr_buf);
		rte_mb();
	}
printf("Start testing....\n");
for (i = 0; i < nb_workers; i++) {
lcore_id = ldm->lcores[i];
offset = nr_buf / nb_workers * i;
lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
if (lcores[i] == NULL) {
printf("lcore parameters malloc failure for lcore %d\n", lcore_id);
break;
}
if (is_dma) {
lcores[i]->dma_name = ldm->dma_names[i];
lcores[i]->dev_id = ldm->dma_ids[i];
lcores[i]->kick_batch = kick_batch;
}
lcores[i]->worker_id = i;
lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
lcores[i]->buf_size = buf_size;
lcores[i]->test_secs = test_secs;
lcores[i]->srcs = srcs + offset;
lcores[i]->dsts = dsts + offset;
lcores[i]->scenario_id = cfg->scenario_id;
lcores[i]->lcore_id = lcore_id;
if (is_dma)
rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
else
rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
}

	/* Wait until every worker has signalled ready. */
	while (1) {
		bool ready = true;

		for (i = 0; i < nb_workers; i++) {
			if (lcores[i]->worker_info.ready_flag == false) {
				ready = false;
				break;
			}
		}
		if (ready)
			break;
	}

	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.start_flag = true;

	/* Warm up, snapshot the counters, run for test_secs, then diff. */
	usleep(TEST_WAIT_U_SECOND);
	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl;

	usleep(test_secs * 1000 * 1000);
	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl -
						lcores[i]->worker_info.test_cpl;

	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.stop_flag = true;

	rte_eal_mp_wait_lcore();

	mops_total = 0;
	bandwidth_total = 0;
	avg_cycles_total = 0;
	for (i = 0; i < nb_workers; i++) {
		calc_result(buf_size, nr_buf, nb_workers, test_secs,
			lcores[i]->worker_info.test_cpl,
			&memory, &avg_cycles, &bandwidth, &mops);
		output_result(cfg->scenario_id, lcores[i]->lcore_id,
					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
					avg_cycles, buf_size, nr_buf / nb_workers, memory,
					bandwidth, mops, is_dma);
		mops_total += mops;
		bandwidth_total += bandwidth;
		avg_cycles_total += avg_cycles;
	}

	printf("\nTotal Bandwidth: %.3lf Gbps, Total MOps: %.3lf\n", bandwidth_total, mops_total);
	snprintf(output_str[MAX_WORKER_NB], MAX_OUTPUT_STR_LEN, CSV_TOTAL_LINE_FMT,
			cfg->scenario_id, nr_buf, memory * nb_workers,
			avg_cycles_total / nb_workers, bandwidth_total, mops_total);

out:
	/* free mbufs used in the test */
	if (srcs != NULL)
		rte_pktmbuf_free_bulk(srcs, nr_buf);
	if (dsts != NULL)
		rte_pktmbuf_free_bulk(dsts, nr_buf);

	/* free the pointer arrays for the mbufs */
	rte_free(srcs);
	srcs = NULL;
	rte_free(dsts);
	dsts = NULL;

	rte_mempool_free(src_pool);
	src_pool = NULL;
	rte_mempool_free(dst_pool);
	dst_pool = NULL;

	/* free the worker parameters */
	for (i = 0; i < nb_workers; i++) {
		rte_free(lcores[i]);
		lcores[i] = NULL;
	}

	if (is_dma) {
		for (i = 0; i < nb_workers; i++) {
			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
			rte_dma_stop(ldm->dma_ids[i]);
		}
	}
}