/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2023 Intel Corporation
 */

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <rte_time.h>
#include <rte_mbuf.h>
#include <rte_dmadev.h>
#include <rte_malloc.h>
#include <rte_lcore.h>

#include "main.h"

#define MAX_DMA_CPL_NB 255

#define TEST_WAIT_U_SECOND 10000
#define POLL_MAX 1000

#define CSV_LINE_DMA_FMT "Scenario %u,%u,%s,%u,%u,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"
#define CSV_LINE_CPU_FMT "Scenario %u,%u,NA,NA,NA,%u,%u,%.2lf,%" PRIu64 ",%.3lf,%.3lf\n"

#define CSV_TOTAL_LINE_FMT "Scenario %u Summary, , , , , ,%u,%.2lf,%u,%.3lf,%.3lf\n"

struct worker_info {
	bool ready_flag;
	bool start_flag;
	bool stop_flag;
	uint32_t total_cpl;
	uint32_t test_cpl;
};

struct lcore_params {
	uint8_t scenario_id;
	unsigned int lcore_id;
	char *dma_name;
	uint16_t worker_id;
	uint16_t dev_id;
	uint32_t nr_buf;
	uint16_t kick_batch;
	uint32_t buf_size;
	uint16_t test_secs;
	struct rte_mbuf **srcs;
	struct rte_mbuf **dsts;
	volatile struct worker_info worker_info;
};

static struct rte_mempool *src_pool;
static struct rte_mempool *dst_pool;

static struct lcore_params *lcores[MAX_WORKER_NB];

#define PRINT_ERR(...) print_err(__func__, __LINE__, __VA_ARGS__)

static inline int
__rte_format_printf(3, 4)
print_err(const char *func, int lineno, const char *format, ...)
{
	va_list ap;
	int ret;

	ret = fprintf(stderr, "In %s:%d - ", func, lineno);
	va_start(ap, format);
	ret += vfprintf(stderr, format, ap);
	va_end(ap);

	return ret;
}

static inline void
calc_result(uint32_t buf_size, uint32_t nr_buf, uint16_t nb_workers, uint16_t test_secs,
			uint32_t total_cnt, float *memory, uint32_t *ave_cycle,
			float *bandwidth, float *mops)
{
	float ops;

	/* total size (in MB) of the src and dst buffers handled by one worker */
	*memory = (float)(buf_size * (nr_buf / nb_workers) * 2) / (1024 * 1024);
	/* average timer (TSC) cycles spent per completed copy */
	*ave_cycle = test_secs * rte_get_timer_hz() / total_cnt;
	ops = (float)total_cnt / test_secs;
	*mops = ops / (1000 * 1000);
	/* copies per second times bits per copy, reported in Gbps */
	*bandwidth = (ops * buf_size * 8) / (1000 * 1000 * 1000);
}
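
/*
 * Worked example for the formulas in calc_result() above (illustrative
 * numbers only, not measured data): with buf_size = 1024 B, test_secs = 2
 * and total_cnt = 10,000,000 completed copies for one worker,
 * ops = 10,000,000 / 2 = 5,000,000 copies/s, so mops = 5.0 and
 * bandwidth = 5,000,000 * 1024 * 8 / 1e9 = 40.96 Gbps. With a 2 GHz timer
 * (rte_get_timer_hz() == 2e9), ave_cycle = 2 * 2e9 / 10,000,000 = 400
 * cycles per copy.
 */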

static void
output_result(uint8_t scenario_id, uint32_t lcore_id, char *dma_name, uint16_t ring_size,
			uint16_t kick_batch, uint64_t ave_cycle, uint32_t buf_size, uint32_t nr_buf,
			float memory, float bandwidth, float mops, bool is_dma)
{
	if (is_dma)
		printf("lcore %u, DMA %s, DMA Ring Size: %u, Kick Batch Size: %u.\n",
				lcore_id, dma_name, ring_size, kick_batch);
	else
		printf("lcore %u\n", lcore_id);

	printf("Average Cycles/op: %" PRIu64 ", Buffer Size: %u B, Buffer Number: %u, Memory: %.2lf MB, Frequency: %.3lf GHz.\n",
			ave_cycle, buf_size, nr_buf, memory, rte_get_timer_hz() / 1000000000.0);
	printf("Average Bandwidth: %.3lf Gbps, MOps: %.3lf\n", bandwidth, mops);

	if (is_dma)
		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_DMA_FMT,
			scenario_id, lcore_id, dma_name, ring_size, kick_batch, buf_size,
			nr_buf, memory, ave_cycle, bandwidth, mops);
	else
		snprintf(output_str[lcore_id], MAX_OUTPUT_STR_LEN, CSV_LINE_CPU_FMT,
			scenario_id, lcore_id, buf_size, nr_buf, memory, ave_cycle,
			bandwidth, mops);
}

static inline void
cache_flush_buf(__rte_unused struct rte_mbuf **array,
		__rte_unused uint32_t buf_size,
		__rte_unused uint32_t nr_buf)
{
#ifdef RTE_ARCH_X86_64
	char *data;
	struct rte_mbuf **srcs = array;
	uint32_t i, offset;

	/* flush every cache line of every buffer so the test starts cold */
	for (i = 0; i < nr_buf; i++) {
		data = rte_pktmbuf_mtod(srcs[i], char *);
		for (offset = 0; offset < buf_size; offset += 64)
			__builtin_ia32_clflush(data + offset);
	}
#endif
}

/* Configuration of device. */
static void
configure_dmadev_queue(uint32_t dev_id, uint32_t ring_size)
{
	uint16_t vchan = 0;
	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = ring_size
	};

	if (rte_dma_configure(dev_id, &dev_config) != 0)
		rte_exit(EXIT_FAILURE, "Error with dma configure.\n");

	if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0)
		rte_exit(EXIT_FAILURE, "Error with queue configuration.\n");

	if (rte_dma_info_get(dev_id, &info) != 0)
		rte_exit(EXIT_FAILURE, "Error with getting device info.\n");

	if (info.nb_vchans != 1)
		rte_exit(EXIT_FAILURE, "Error, no configured queues reported on device id %u.\n",
				dev_id);

	if (rte_dma_start(dev_id) != 0)
		rte_exit(EXIT_FAILURE, "Error with dma start.\n");
}

static int
config_dmadevs(struct test_configure *cfg)
{
	uint32_t ring_size = cfg->ring_size.cur;
	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
	uint32_t nb_workers = ldm->cnt;
	uint32_t i;
	int dev_id;
	uint16_t nb_dmadevs = 0;
	char *dma_name;

	for (i = 0; i < ldm->cnt; i++) {
		dma_name = ldm->dma_names[i];
		dev_id = rte_dma_get_dev_id_by_name(dma_name);
		if (dev_id < 0) {
			fprintf(stderr, "Error: Failed to find DMA %s.\n", dma_name);
			goto end;
		}

		ldm->dma_ids[i] = dev_id;
		configure_dmadev_queue(dev_id, ring_size);
		++nb_dmadevs;
	}

end:
	if (nb_dmadevs < nb_workers) {
		printf("Not enough dmadevs (%u) for all workers (%u).\n", nb_dmadevs, nb_workers);
		return -1;
	}

	printf("Number of used dmadevs: %u.\n", nb_dmadevs);

	return 0;
}

static void
error_exit(int dev_id)
{
	rte_dma_stop(dev_id);
	rte_dma_close(dev_id);
	rte_exit(EXIT_FAILURE, "DMA error\n");
}

static inline void
do_dma_submit_and_poll(uint16_t dev_id, uint64_t *async_cnt,
			volatile struct worker_info *worker_info)
{
	int ret;
	uint16_t nr_cpl;

	ret = rte_dma_submit(dev_id, 0);
	if (ret < 0)
		error_exit(dev_id);

	nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
	*async_cnt -= nr_cpl;
	worker_info->total_cpl += nr_cpl;
}

static inline int
do_dma_mem_copy(void *p)
{
	struct lcore_params *para = (struct lcore_params *)p;
	volatile struct worker_info *worker_info = &(para->worker_info);
	const uint16_t dev_id = para->dev_id;
	const uint32_t nr_buf = para->nr_buf;
	const uint16_t kick_batch = para->kick_batch;
	const uint32_t buf_size = para->buf_size;
	struct rte_mbuf **srcs = para->srcs;
	struct rte_mbuf **dsts = para->dsts;
	uint16_t nr_cpl;
	uint64_t async_cnt = 0;
	uint32_t i;
	uint32_t poll_cnt = 0;
	int ret;

	worker_info->stop_flag = false;
	worker_info->ready_flag = true;

	while (!worker_info->start_flag)
		;

	while (1) {
		for (i = 0; i < nr_buf; i++) {
dma_copy:
			ret = rte_dma_copy(dev_id, 0, rte_mbuf_data_iova(srcs[i]),
				rte_mbuf_data_iova(dsts[i]), buf_size, 0);
			if (unlikely(ret < 0)) {
				if (ret == -ENOSPC) {
					/* ring is full: kick the device, reap completions, retry */
					do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
					goto dma_copy;
				} else
					error_exit(dev_id);
			}
			async_cnt++;

			if ((async_cnt % kick_batch) == 0)
				do_dma_submit_and_poll(dev_id, &async_cnt, worker_info);
		}

		if (worker_info->stop_flag)
			break;
	}

	rte_dma_submit(dev_id, 0);
	while ((async_cnt > 0) && (poll_cnt++ < POLL_MAX)) {
		nr_cpl = rte_dma_completed(dev_id, 0, MAX_DMA_CPL_NB, NULL, NULL);
		async_cnt -= nr_cpl;
	}

	return 0;
}
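
/*
 * Note on the batching behaviour of do_dma_mem_copy() above (illustrative
 * figures, not requirements): with kick_batch = 32, rte_dma_submit() is
 * issued once per 32 successful rte_dma_copy() enqueues, and completed
 * descriptors are reaped in the same call. If the descriptor ring fills up
 * first, rte_dma_copy() returns -ENOSPC and the worker submits and polls
 * immediately before retrying the same buffer, so kick_batch is an upper
 * bound on how many copies can sit unsubmitted, not a guaranteed batch size.
 */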

static inline int
do_cpu_mem_copy(void *p)
{
	struct lcore_params *para = (struct lcore_params *)p;
	volatile struct worker_info *worker_info = &(para->worker_info);
	const uint32_t nr_buf = para->nr_buf;
	const uint32_t buf_size = para->buf_size;
	struct rte_mbuf **srcs = para->srcs;
	struct rte_mbuf **dsts = para->dsts;
	uint32_t i;

	worker_info->stop_flag = false;
	worker_info->ready_flag = true;

	while (!worker_info->start_flag)
		;

	while (1) {
		for (i = 0; i < nr_buf; i++) {
			const void *src = rte_pktmbuf_mtod(srcs[i], void *);
			void *dst = rte_pktmbuf_mtod(dsts[i], void *);

			/* copy buffer from src to dst */
			rte_memcpy(dst, src, (size_t)buf_size);
			worker_info->total_cpl++;
		}
		if (worker_info->stop_flag)
			break;
	}

	return 0;
}

static int
setup_memory_env(struct test_configure *cfg, struct rte_mbuf ***srcs,
			struct rte_mbuf ***dsts)
{
	unsigned int buf_size = cfg->buf_size.cur;
	unsigned int nr_sockets;
	uint32_t nr_buf = cfg->nr_buf;

	nr_sockets = rte_socket_count();
	if (cfg->src_numa_node >= nr_sockets ||
		cfg->dst_numa_node >= nr_sockets) {
		printf("Error: Source or destination numa node exceeds the actual numa nodes.\n");
		return -1;
	}

	src_pool = rte_pktmbuf_pool_create("Benchmark_DMA_SRC",
			nr_buf,
			0,
			0,
			buf_size + RTE_PKTMBUF_HEADROOM,
			cfg->src_numa_node);
	if (src_pool == NULL) {
		PRINT_ERR("Error with source mempool creation.\n");
		return -1;
	}

	dst_pool = rte_pktmbuf_pool_create("Benchmark_DMA_DST",
			nr_buf,
			0,
			0,
			buf_size + RTE_PKTMBUF_HEADROOM,
			cfg->dst_numa_node);
	if (dst_pool == NULL) {
		PRINT_ERR("Error with destination mempool creation.\n");
		return -1;
	}

	*srcs = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
	if (*srcs == NULL) {
		printf("Error: srcs malloc failed.\n");
		return -1;
	}

	*dsts = rte_malloc(NULL, nr_buf * sizeof(struct rte_mbuf *), 0);
	if (*dsts == NULL) {
		printf("Error: dsts malloc failed.\n");
		return -1;
	}

	if (rte_pktmbuf_alloc_bulk(src_pool, *srcs, nr_buf) != 0) {
		printf("alloc src mbufs failed.\n");
		return -1;
	}

	if (rte_pktmbuf_alloc_bulk(dst_pool, *dsts, nr_buf) != 0) {
		printf("alloc dst mbufs failed.\n");
		return -1;
	}

	return 0;
}

void
mem_copy_benchmark(struct test_configure *cfg, bool is_dma)
{
	uint16_t i;
	uint32_t offset;
	unsigned int lcore_id = 0;
	struct rte_mbuf **srcs = NULL, **dsts = NULL;
	struct lcore_dma_map_t *ldm = &cfg->lcore_dma_map;
	unsigned int buf_size = cfg->buf_size.cur;
	uint16_t kick_batch = cfg->kick_batch.cur;
	uint32_t nr_buf = cfg->nr_buf = (cfg->mem_size.cur * 1024 * 1024) / (cfg->buf_size.cur * 2);
	uint16_t nb_workers = ldm->cnt;
	uint16_t test_secs = cfg->test_secs;
	float memory = 0;
	uint32_t avg_cycles = 0;
	uint32_t avg_cycles_total;
	float mops, mops_total;
	float bandwidth, bandwidth_total;

	if (setup_memory_env(cfg, &srcs, &dsts) < 0)
		goto out;

	if (is_dma)
		if (config_dmadevs(cfg) < 0)
			goto out;

	if (cfg->cache_flush == 1) {
		cache_flush_buf(srcs, buf_size, nr_buf);
		cache_flush_buf(dsts, buf_size, nr_buf);
		rte_mb();
	}

	printf("Start testing....\n");

	for (i = 0; i < nb_workers; i++) {
		lcore_id = ldm->lcores[i];
		offset = nr_buf / nb_workers * i;
		lcores[i] = rte_malloc(NULL, sizeof(struct lcore_params), 0);
		if (lcores[i] == NULL) {
			printf("lcore parameters malloc failure for lcore %u\n", lcore_id);
			break;
		}
		if (is_dma) {
			lcores[i]->dma_name = ldm->dma_names[i];
			lcores[i]->dev_id = ldm->dma_ids[i];
			lcores[i]->kick_batch = kick_batch;
		}
		lcores[i]->worker_id = i;
		lcores[i]->nr_buf = (uint32_t)(nr_buf / nb_workers);
		lcores[i]->buf_size = buf_size;
		lcores[i]->test_secs = test_secs;
		lcores[i]->srcs = srcs + offset;
		lcores[i]->dsts = dsts + offset;
		lcores[i]->scenario_id = cfg->scenario_id;
		lcores[i]->lcore_id = lcore_id;

		if (is_dma)
			rte_eal_remote_launch(do_dma_mem_copy, (void *)(lcores[i]), lcore_id);
		else
			rte_eal_remote_launch(do_cpu_mem_copy, (void *)(lcores[i]), lcore_id);
	}

	while (1) {
		bool ready = true;
		for (i = 0; i < nb_workers; i++) {
			if (lcores[i]->worker_info.ready_flag == false) {
				ready = false;
				break;
			}
		}
		if (ready)
			break;
	}
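
	/*
	 * Measurement window (implemented by the statements below): once every
	 * worker has set ready_flag, start_flag releases them. The test then
	 * waits TEST_WAIT_U_SECOND (10 ms) as a warm-up, snapshots total_cpl
	 * into test_cpl, sleeps for test_secs seconds and subtracts the
	 * snapshot again, so test_cpl holds only the completions observed
	 * inside the timed window.
	 */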
	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.start_flag = true;

	usleep(TEST_WAIT_U_SECOND);
	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl;

	usleep(test_secs * 1000 * 1000);
	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.test_cpl = lcores[i]->worker_info.total_cpl -
						lcores[i]->worker_info.test_cpl;

	for (i = 0; i < nb_workers; i++)
		lcores[i]->worker_info.stop_flag = true;

	rte_eal_mp_wait_lcore();

	mops_total = 0;
	bandwidth_total = 0;
	avg_cycles_total = 0;
	for (i = 0; i < nb_workers; i++) {
		calc_result(buf_size, nr_buf, nb_workers, test_secs,
			lcores[i]->worker_info.test_cpl, &memory, &avg_cycles,
			&bandwidth, &mops);
		output_result(cfg->scenario_id, lcores[i]->lcore_id,
					lcores[i]->dma_name, cfg->ring_size.cur, kick_batch,
					avg_cycles, buf_size, nr_buf / nb_workers, memory,
					bandwidth, mops, is_dma);
		mops_total += mops;
		bandwidth_total += bandwidth;
		avg_cycles_total += avg_cycles;
	}
	printf("\nTotal Bandwidth: %.3lf Gbps, Total MOps: %.3lf\n", bandwidth_total,
			mops_total);
	snprintf(output_str[MAX_WORKER_NB], MAX_OUTPUT_STR_LEN, CSV_TOTAL_LINE_FMT,
			cfg->scenario_id, nr_buf, memory * nb_workers,
			avg_cycles_total / nb_workers, bandwidth_total, mops_total);

out:
	/* free mbufs used in the test */
	if (srcs != NULL)
		rte_pktmbuf_free_bulk(srcs, nr_buf);
	if (dsts != NULL)
		rte_pktmbuf_free_bulk(dsts, nr_buf);

	/* free the pointers for the mbufs */
	rte_free(srcs);
	srcs = NULL;
	rte_free(dsts);
	dsts = NULL;

	rte_mempool_free(src_pool);
	src_pool = NULL;

	rte_mempool_free(dst_pool);
	dst_pool = NULL;

	/* free the worker parameters */
	for (i = 0; i < nb_workers; i++) {
		rte_free(lcores[i]);
		lcores[i] = NULL;
	}

	if (is_dma) {
		for (i = 0; i < nb_workers; i++) {
			printf("Stopping dmadev %d\n", ldm->dma_ids[i]);
			rte_dma_stop(ldm->dma_ids[i]);
		}
	}
}
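
/*
 * Illustrative usage sketch (not part of the benchmark code): in this app,
 * mem_copy_benchmark() is normally driven by the harness that parses the
 * test configuration; struct test_configure is defined in main.h and may
 * carry more mandatory fields than shown here. The fragment below only
 * fills the fields that this file actually reads, and the DMA device name
 * "0000:00:04.0" is a hypothetical example.
 *
 *	struct test_configure cfg = { 0 };
 *
 *	cfg.scenario_id = 1;
 *	cfg.mem_size.cur = 64;		// total MB, split across src and dst
 *	cfg.buf_size.cur = 4096;	// bytes per copy
 *	cfg.ring_size.cur = 1024;	// DMA descriptor ring depth
 *	cfg.kick_batch.cur = 32;	// copies per rte_dma_submit()
 *	cfg.test_secs = 2;
 *	cfg.src_numa_node = 0;
 *	cfg.dst_numa_node = 0;
 *	cfg.cache_flush = 0;
 *	cfg.lcore_dma_map.cnt = 1;
 *	cfg.lcore_dma_map.lcores[0] = 1;	// worker lcore id
 *	// set cfg.lcore_dma_map.dma_names[0] to the dmadev name, e.g. "0000:00:04.0"
 *
 *	mem_copy_benchmark(&cfg, true);		// true = DMA copy, false = CPU memcpy
 */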