/* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2019 Intel Corporation */ #include #include #include #include #include #include #include #include "test.h" #define STACK_NAME "STACK_PERF" #define MAX_BURST 32 #define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST) /* * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time * constants. */ static volatile unsigned int bulk_sizes[] = {8, MAX_BURST}; static rte_atomic32_t lcore_barrier; struct lcore_pair { unsigned int c1; unsigned int c2; }; static int get_two_hyperthreads(struct lcore_pair *lcp) { unsigned int socket[2]; unsigned int core[2]; unsigned int id[2]; RTE_LCORE_FOREACH(id[0]) { RTE_LCORE_FOREACH(id[1]) { if (id[0] == id[1]) continue; core[0] = rte_lcore_to_cpu_id(id[0]); core[1] = rte_lcore_to_cpu_id(id[1]); socket[0] = rte_lcore_to_socket_id(id[0]); socket[1] = rte_lcore_to_socket_id(id[1]); if ((core[0] == core[1]) && (socket[0] == socket[1])) { lcp->c1 = id[0]; lcp->c2 = id[1]; return 0; } } } return 1; } static int get_two_cores(struct lcore_pair *lcp) { unsigned int socket[2]; unsigned int core[2]; unsigned int id[2]; RTE_LCORE_FOREACH(id[0]) { RTE_LCORE_FOREACH(id[1]) { if (id[0] == id[1]) continue; core[0] = rte_lcore_to_cpu_id(id[0]); core[1] = rte_lcore_to_cpu_id(id[1]); socket[0] = rte_lcore_to_socket_id(id[0]); socket[1] = rte_lcore_to_socket_id(id[1]); if ((core[0] != core[1]) && (socket[0] == socket[1])) { lcp->c1 = id[0]; lcp->c2 = id[1]; return 0; } } } return 1; } static int get_two_sockets(struct lcore_pair *lcp) { unsigned int socket[2]; unsigned int id[2]; RTE_LCORE_FOREACH(id[0]) { RTE_LCORE_FOREACH(id[1]) { if (id[0] == id[1]) continue; socket[0] = rte_lcore_to_socket_id(id[0]); socket[1] = rte_lcore_to_socket_id(id[1]); if (socket[0] != socket[1]) { lcp->c1 = id[0]; lcp->c2 = id[1]; return 0; } } } return 1; } /* Measure the cycle cost of popping an empty stack. */ static void test_empty_pop(struct rte_stack *s) { unsigned int iterations = 100000000; void *objs[MAX_BURST]; unsigned int i; uint64_t start = rte_rdtsc(); for (i = 0; i < iterations; i++) rte_stack_pop(s, objs, bulk_sizes[0]); uint64_t end = rte_rdtsc(); printf("Stack empty pop: %.2F\n", (double)(end - start) / iterations); } struct thread_args { struct rte_stack *s; unsigned int sz; double avg; }; /* Measure the average per-pointer cycle cost of stack push and pop */ static int bulk_push_pop(void *p) { unsigned int iterations = 1000000; struct thread_args *args = p; void *objs[MAX_BURST] = {0}; unsigned int size, i; struct rte_stack *s; s = args->s; size = args->sz; rte_atomic32_sub(&lcore_barrier, 1); while (rte_atomic32_read(&lcore_barrier) != 0) rte_pause(); uint64_t start = rte_rdtsc(); for (i = 0; i < iterations; i++) { rte_stack_push(s, objs, size); rte_stack_pop(s, objs, size); } uint64_t end = rte_rdtsc(); args->avg = ((double)(end - start))/(iterations * size); return 0; } /* * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack * perf when between hyperthread siblings, cores on the same socket, and cores * on different sockets. */ static void run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s, lcore_function_t fn) { struct thread_args args[2]; unsigned int i; for (i = 0; i < RTE_DIM(bulk_sizes); i++) { rte_atomic32_set(&lcore_barrier, 2); args[0].sz = args[1].sz = bulk_sizes[i]; args[0].s = args[1].s = s; if (cores->c1 == rte_get_main_lcore()) { rte_eal_remote_launch(fn, &args[1], cores->c2); fn(&args[0]); rte_eal_wait_lcore(cores->c2); } else { rte_eal_remote_launch(fn, &args[0], cores->c1); rte_eal_remote_launch(fn, &args[1], cores->c2); rte_eal_wait_lcore(cores->c1); rte_eal_wait_lcore(cores->c2); } printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", bulk_sizes[i], (args[0].avg + args[1].avg) / 2); } } /* Run bulk_push_pop() simultaneously on 1+ cores. */ static void run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n) { struct thread_args args[RTE_MAX_LCORE]; unsigned int i; for (i = 0; i < RTE_DIM(bulk_sizes); i++) { unsigned int lcore_id; int cnt = 0; double avg; rte_atomic32_set(&lcore_barrier, n); RTE_LCORE_FOREACH_WORKER(lcore_id) { if (++cnt >= n) break; args[lcore_id].s = s; args[lcore_id].sz = bulk_sizes[i]; if (rte_eal_remote_launch(fn, &args[lcore_id], lcore_id)) rte_panic("Failed to launch lcore %d\n", lcore_id); } lcore_id = rte_lcore_id(); args[lcore_id].s = s; args[lcore_id].sz = bulk_sizes[i]; fn(&args[lcore_id]); rte_eal_mp_wait_lcore(); avg = args[rte_lcore_id()].avg; cnt = 0; RTE_LCORE_FOREACH_WORKER(lcore_id) { if (++cnt >= n) break; avg += args[lcore_id].avg; } printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", bulk_sizes[i], avg / n); } } /* * Measure the cycle cost of pushing and popping a single pointer on a single * lcore. */ static void test_single_push_pop(struct rte_stack *s) { unsigned int iterations = 16000000; void *obj = NULL; unsigned int i; uint64_t start = rte_rdtsc(); for (i = 0; i < iterations; i++) { rte_stack_push(s, &obj, 1); rte_stack_pop(s, &obj, 1); } uint64_t end = rte_rdtsc(); printf("Average cycles per single object push/pop: %.2F\n", ((double)(end - start)) / iterations); } /* Measure the cycle cost of bulk pushing and popping on a single lcore. */ static void test_bulk_push_pop(struct rte_stack *s) { unsigned int iterations = 8000000; void *objs[MAX_BURST]; unsigned int sz, i; for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) { uint64_t start = rte_rdtsc(); for (i = 0; i < iterations; i++) { rte_stack_push(s, objs, bulk_sizes[sz]); rte_stack_pop(s, objs, bulk_sizes[sz]); } uint64_t end = rte_rdtsc(); double avg = ((double)(end - start) / (iterations * bulk_sizes[sz])); printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", bulk_sizes[sz], avg); } } static int __test_stack_perf(uint32_t flags) { struct lcore_pair cores; struct rte_stack *s; rte_atomic32_init(&lcore_barrier); s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags); if (s == NULL) { printf("[%s():%u] failed to create a stack\n", __func__, __LINE__); return -1; } printf("### Testing single element push/pop ###\n"); test_single_push_pop(s); printf("\n### Testing empty pop ###\n"); test_empty_pop(s); printf("\n### Testing using a single lcore ###\n"); test_bulk_push_pop(s); if (get_two_hyperthreads(&cores) == 0) { printf("\n### Testing using two hyperthreads ###\n"); run_on_core_pair(&cores, s, bulk_push_pop); } if (get_two_cores(&cores) == 0) { printf("\n### Testing using two physical cores ###\n"); run_on_core_pair(&cores, s, bulk_push_pop); } if (get_two_sockets(&cores) == 0) { printf("\n### Testing using two NUMA nodes ###\n"); run_on_core_pair(&cores, s, bulk_push_pop); } printf("\n### Testing on all %u lcores ###\n", rte_lcore_count()); run_on_n_cores(s, bulk_push_pop, rte_lcore_count()); rte_stack_free(s); return 0; } static int test_stack_perf(void) { return __test_stack_perf(0); } static int test_lf_stack_perf(void) { return __test_stack_perf(RTE_STACK_F_LF); } REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf); REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);