/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include "test_ring_stress.h"

/**
 * Stress test for ring enqueue/dequeue operations.
 * Each worker repeatedly performs the following pattern:
 * dequeue a burst, read and update the dequeued objects, enqueue them back.
 * Serves as both a functional and a performance test of ring
 * enqueue/dequeue operations under high contention
 * (both over-committed and non-over-committed scenarios).
 */

#define RING_NAME	"RING_STRESS"
#define BULK_NUM	32
#define RING_SIZE	(2 * BULK_NUM * RTE_MAX_LCORE)

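/* commands passed from the main lcore to the workers */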
enum {
	WRK_CMD_STOP,
	WRK_CMD_RUN,
};

static volatile uint32_t wrk_cmd __rte_cache_aligned;

/* test run-time in seconds */
static const uint32_t run_time = 60;
static const uint32_t verbose;

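/* per-lcore statistics for the enqueue/dequeue loop */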
struct lcore_stat {
	uint64_t nb_cycle;
	struct {
		uint64_t nb_call;
		uint64_t nb_obj;
		uint64_t nb_cycle;
		uint64_t max_cycle;
		uint64_t min_cycle;
	} op;
};

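/* per-worker argument, cache-aligned to avoid false sharing of stats */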
struct lcore_arg {
	struct rte_ring *rng;
	struct lcore_stat stats;
} __rte_cache_aligned;

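/* ring object: one cache line worth of 32-bit counters */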
struct ring_elem {
	uint32_t cnt[RTE_CACHE_LINE_SIZE / sizeof(uint32_t)];
} __rte_cache_aligned;

/*
 * ring flavour specific functions, defined by the
 * test file that includes this header
 */
static uint32_t
_st_ring_dequeue_bulk(struct rte_ring *r, void **obj, uint32_t n,
	uint32_t *avail);

static uint32_t
_st_ring_enqueue_bulk(struct rte_ring *r, void * const *obj, uint32_t n,
	uint32_t *free);

static int
_st_ring_init(struct rte_ring *r, const char *name, uint32_t num);


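/* accumulate per-call statistics; min/max are tracked only in precise mode */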
static void
lcore_stat_update(struct lcore_stat *ls, uint64_t call, uint64_t obj,
	uint64_t tm, int32_t prcs)
{
	ls->op.nb_call += call;
	ls->op.nb_obj += obj;
	ls->op.nb_cycle += tm;
	if (prcs) {
		ls->op.max_cycle = RTE_MAX(ls->op.max_cycle, tm);
		ls->op.min_cycle = RTE_MIN(ls->op.min_cycle, tm);
	}
}

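/* merge one worker's operation stats into the aggregate */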
static void
lcore_op_stat_aggr(struct lcore_stat *ms, const struct lcore_stat *ls)
{
	ms->op.nb_call += ls->op.nb_call;
	ms->op.nb_obj += ls->op.nb_obj;
	ms->op.nb_cycle += ls->op.nb_cycle;
	ms->op.max_cycle = RTE_MAX(ms->op.max_cycle, ls->op.max_cycle);
	ms->op.min_cycle = RTE_MIN(ms->op.min_cycle, ls->op.min_cycle);
}

static void
lcore_stat_aggr(struct lcore_stat *ms, const struct lcore_stat *ls)
{
	ms->nb_cycle = RTE_MAX(ms->nb_cycle, ls->nb_cycle);
	lcore_op_stat_aggr(ms, ls);
}

static void
lcore_stat_dump(FILE *f, uint32_t lc, const struct lcore_stat *ls)
{
	long double st;

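	/* TSC cycles per microsecond */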
	st = (long double)rte_get_timer_hz() / US_PER_S;

	if (lc == UINT32_MAX)
		fprintf(f, "%s(AGGREGATE)={\n", __func__);
	else
		fprintf(f, "%s(lcore=%u)={\n", __func__, lc);

	fprintf(f, "\tnb_cycle=%" PRIu64 "(%.2Lf usec),\n",
		ls->nb_cycle, (long double)ls->nb_cycle / st);

	fprintf(f, "\tDEQ+ENQ={\n");

	fprintf(f, "\t\tnb_call=%" PRIu64 ",\n", ls->op.nb_call);
	fprintf(f, "\t\tnb_obj=%" PRIu64 ",\n", ls->op.nb_obj);
	fprintf(f, "\t\tnb_cycle=%" PRIu64 ",\n", ls->op.nb_cycle);
	fprintf(f, "\t\tobj/call(avg): %.2Lf\n",
		(long double)ls->op.nb_obj / ls->op.nb_call);
	fprintf(f, "\t\tcycles/obj(avg): %.2Lf\n",
		(long double)ls->op.nb_cycle / ls->op.nb_obj);
	fprintf(f, "\t\tcycles/call(avg): %.2Lf\n",
		(long double)ls->op.nb_cycle / ls->op.nb_call);

	/* if min/max cycles per call stats were collected */
	if (ls->op.min_cycle != UINT64_MAX) {
		fprintf(f, "\t\tmax cycles/call=%" PRIu64 "(%.2Lf usec),\n",
			ls->op.max_cycle,
			(long double)ls->op.max_cycle / st);
		fprintf(f, "\t\tmin cycles/call=%" PRIu64 "(%.2Lf usec),\n",
			ls->op.min_cycle,
			(long double)ls->op.min_cycle / st);
	}

	fprintf(f, "\t},\n");
	fprintf(f, "};\n");
}

static void
fill_ring_elm(struct ring_elem *elm, uint32_t fill)
{
	uint32_t i;

	for (i = 0; i != RTE_DIM(elm->cnt); i++)
		elm->cnt[i] = fill;
}

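/*
 * Verify that each dequeued object still holds the 'check' pattern,
 * then overwrite it with the 'fill' pattern. The spinlock only
 * serializes error dumps from concurrent workers.
 */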
static int32_t
check_updt_elem(struct ring_elem *elm[], uint32_t num,
	const struct ring_elem *check, const struct ring_elem *fill)
{
	uint32_t i;

	static rte_spinlock_t dump_lock;

	for (i = 0; i != num; i++) {
		if (memcmp(check, elm[i], sizeof(*check)) != 0) {
			rte_spinlock_lock(&dump_lock);
			printf("%s(lc=%u, num=%u) failed at %u-th iter, "
				"offending object: %p\n",
				__func__, rte_lcore_id(), num, i, elm[i]);
			rte_memdump(stdout, "expected", check, sizeof(*check));
			rte_memdump(stdout, "result", elm[i], sizeof(*elm[i]));
			rte_spinlock_unlock(&dump_lock);
			return -EINVAL;
		}
		memcpy(elm[i], fill, sizeof(*elm[i]));
	}

	return 0;
}

static int
check_ring_op(uint32_t exp, uint32_t res, uint32_t lc,
	const char *fname, const char *opname)
{
	if (exp != res) {
		printf("%s(lc=%u) failure: %s expected: %u, returned %u\n",
			fname, lc, opname, exp, res);
		return -ENOSPC;
	}
	return 0;
}

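/*
 * Worker loop: wait for the RUN command, then repeatedly dequeue a burst,
 * verify and update the objects, enqueue them back and update the stats
 * until the STOP command arrives. With prcs != 0 each dequeue/enqueue
 * pair is timed individually (min/max/avg), otherwise only the total
 * run time is accounted.
 */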
static int
test_worker(void *arg, const char *fname, int32_t prcs)
{
	int32_t rc;
	uint32_t lc, n, num;
	uint64_t cl, tm0, tm1;
	struct lcore_arg *la;
	struct ring_elem def_elm, loc_elm;
	struct ring_elem *obj[2 * BULK_NUM];

	la = arg;
	lc = rte_lcore_id();

	fill_ring_elm(&def_elm, UINT32_MAX);
	fill_ring_elm(&loc_elm, lc);

	while (wrk_cmd != WRK_CMD_RUN) {
		rte_smp_rmb();
		rte_pause();
	}

	cl = rte_rdtsc_precise();

	do {
		/* num in interval [7/8, 11/8) of BULK_NUM */
		num = 7 * BULK_NUM / 8 + rte_rand() % (BULK_NUM / 2);

		/* reset all pointer values */
		memset(obj, 0, sizeof(obj));

		/* dequeue num elems */
		tm0 = (prcs != 0) ? rte_rdtsc_precise() : 0;
		n = _st_ring_dequeue_bulk(la->rng, (void **)obj, num, NULL);
		tm0 = (prcs != 0) ? rte_rdtsc_precise() - tm0 : 0;

		/* check return value and objects */
		rc = check_ring_op(num, n, lc, fname,
			RTE_STR(_st_ring_dequeue_bulk));
		if (rc == 0)
			rc = check_updt_elem(obj, num, &def_elm, &loc_elm);
		if (rc != 0)
			break;

		/* enqueue num elems */
		rte_compiler_barrier();
		rc = check_updt_elem(obj, num, &loc_elm, &def_elm);
		if (rc != 0)
			break;

		tm1 = (prcs != 0) ? rte_rdtsc_precise() : 0;
		n = _st_ring_enqueue_bulk(la->rng, (void * const *)obj, num, NULL);
		tm1 = (prcs != 0) ? rte_rdtsc_precise() - tm1 : 0;

		/* check return value */
		rc = check_ring_op(num, n, lc, fname,
			RTE_STR(_st_ring_enqueue_bulk));
		if (rc != 0)
			break;

		lcore_stat_update(&la->stats, 1, num, tm0 + tm1, prcs);

	} while (wrk_cmd == WRK_CMD_RUN);

	cl = rte_rdtsc_precise() - cl;
	if (prcs == 0)
		lcore_stat_update(&la->stats, 0, 0, cl, 0);
	la->stats.nb_cycle = cl;
	return rc;
}

static int
test_worker_prcs(void *arg)
{
	return test_worker(arg, __func__, 1);
}

static int
test_worker_avg(void *arg)
{
	return test_worker(arg, __func__, 0);
}

static void
mt1_fini(struct rte_ring *rng, void *data)
{
	rte_free(rng);
	rte_free(data);
}

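/*
 * Allocate the object array and a ring large enough to hold twice as many
 * objects, initialize the ring and pre-fill it with all objects.
 */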
static int
mt1_init(struct rte_ring **rng, void **data, uint32_t num)
{
	int32_t rc;
	size_t sz;
	uint32_t i, nr;
	struct rte_ring *r;
	struct ring_elem *elm;
	void *p;

	*rng = NULL;
	*data = NULL;

	sz = num * sizeof(*elm);
	elm = rte_zmalloc(NULL, sz, __alignof__(*elm));
	if (elm == NULL) {
		printf("%s: alloc(%zu) for %u elems data failed",
			__func__, sz, num);
		return -ENOMEM;
	}

	*data = elm;

	/* alloc ring */
	nr = 2 * num;
	sz = rte_ring_get_memsize(nr);
	r = rte_zmalloc(NULL, sz, __alignof__(*r));
	if (r == NULL) {
		printf("%s: alloc(%zu) for FIFO with %u elems failed",
			__func__, sz, nr);
		return -ENOMEM;
	}

	*rng = r;

	rc = _st_ring_init(r, RING_NAME, nr);
	if (rc != 0) {
		printf("%s: _st_ring_init(%p, %u) failed, error: %d(%s)\n",
			__func__, r, nr, rc, strerror(-rc));
		return rc;
	}

	for (i = 0; i != num; i++) {
		fill_ring_elm(elm + i, UINT32_MAX);
		p = elm + i;
		if (_st_ring_enqueue_bulk(r, &p, 1, NULL) != 1)
			break;
	}

	if (i != num) {
		printf("%s: _st_ring_enqueue(%p, %u) returned %u\n",
			__func__, r, num, i);
		return -ENOSPC;
	}

	return 0;
}

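/*
 * Launch the given worker function on all worker lcores, let it run for
 * run_time seconds, then stop the workers and report the aggregated
 * (and, if verbose, per-lcore) statistics.
 */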
static int
test_mt1(int (*test)(void *))
{
	int32_t rc;
	uint32_t lc, mc;
	struct rte_ring *r;
	void *data;
	struct lcore_arg arg[RTE_MAX_LCORE];

	static const struct lcore_stat init_stat = {
		.op.min_cycle = UINT64_MAX,
	};

	rc = mt1_init(&r, &data, RING_SIZE);
	if (rc != 0) {
		mt1_fini(r, data);
		return rc;
	}

	memset(arg, 0, sizeof(arg));

	/* launch on all workers */
	RTE_LCORE_FOREACH_WORKER(lc) {
		arg[lc].rng = r;
		arg[lc].stats = init_stat;
		rte_eal_remote_launch(test, &arg[lc], lc);
	}

	/* signal workers to start the test */
	wrk_cmd = WRK_CMD_RUN;
	rte_smp_wmb();

	usleep(run_time * US_PER_S);

	/* signal workers to stop the test */
	wrk_cmd = WRK_CMD_STOP;
	rte_smp_wmb();

	/* wait for workers and collect stats. */
	mc = rte_lcore_id();
	arg[mc].stats = init_stat;

	rc = 0;
	RTE_LCORE_FOREACH_WORKER(lc) {
		rc |= rte_eal_wait_lcore(lc);
		lcore_stat_aggr(&arg[mc].stats, &arg[lc].stats);
		if (verbose != 0)
			lcore_stat_dump(stdout, lc, &arg[lc].stats);
	}

	lcore_stat_dump(stdout, UINT32_MAX, &arg[mc].stats);
	mt1_fini(r, data);
	return rc;
}

static const struct test_case tests[] = {
	{
		.name = "MT-WRK_ENQ_DEQ-MST_NONE-PRCS",
		.func = test_mt1,
		.wfunc = test_worker_prcs,
	},
	{
		.name = "MT-WRK_ENQ_DEQ-MST_NONE-AVG",
		.func = test_mt1,
		.wfunc = test_worker_avg,
	},
};