/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#ifndef _CN10K_ML_OPS_H_
#define _CN10K_ML_OPS_H_

#include <rte_mldev.h>
#include <rte_mldev_pmd.h>

#include <roc_api.h>

struct cnxk_ml_dev;
struct cnxk_ml_qp;
struct cnxk_ml_model;
struct cnxk_ml_layer;
struct cnxk_ml_req;

/* Firmware version string length */
#define MLDEV_FIRMWARE_VERSION_LENGTH 32

/* Job types */
enum cn10k_ml_job_type {
	ML_CN10K_JOB_TYPE_MODEL_RUN = 0,
	ML_CN10K_JOB_TYPE_MODEL_STOP,
	ML_CN10K_JOB_TYPE_MODEL_START,
	ML_CN10K_JOB_TYPE_FIRMWARE_LOAD,
	ML_CN10K_JOB_TYPE_FIRMWARE_SELFTEST,
};

/* Firmware stats */
struct cn10k_ml_stats {
	/* Firmware start cycle */
	uint64_t fw_start;

	/* Firmware end cycle */
	uint64_t fw_end;

	/* Hardware start cycle */
	uint64_t hw_start;

	/* Hardware end cycle */
	uint64_t hw_end;
};

/* Result structure */
struct cn10k_ml_result {
	/* Job error code */
	uint64_t error_code;

	/* Stats */
	struct cn10k_ml_stats stats;

	/* User context pointer */
	void *user_ptr;
};
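/*
 * Illustrative sketch (not part of the driver API): the per-job cycle
 * counters carried in struct cn10k_ml_result reduce to firmware-side and
 * hardware-side cycle counts as below. The helper names are hypothetical
 * and exist only to show how the stats fields relate.
 */
static inline uint64_t
cn10k_ml_example_fw_cycles(const struct cn10k_ml_result *result)
{
	/* Cycles between the firmware start and end timestamps of the job. */
	return result->stats.fw_end - result->stats.fw_start;
}

static inline uint64_t
cn10k_ml_example_hw_cycles(const struct cn10k_ml_result *result)
{
	/* Cycles between the hardware start and end timestamps of the job. */
	return result->stats.hw_end - result->stats.hw_start;
}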
/* Firmware capability structure */
union cn10k_ml_fw_cap {
	uint64_t u64;

	struct {
		/* CMPC completion support */
		uint64_t cmpc_completions : 1;

		/* Poll mode completion support */
		uint64_t poll_completions : 1;

		/* SSO completion support */
		uint64_t sso_completions : 1;

		/* Support for model side loading */
		uint64_t side_load_model : 1;

		/* Batch execution */
		uint64_t batch_run : 1;

		/* Max number of models to be loaded in parallel */
		uint64_t max_models : 8;

		/* Firmware statistics */
		uint64_t fw_stats : 1;

		/* Hardware statistics */
		uint64_t hw_stats : 1;

		/* Max number of batches */
		uint64_t max_num_batches : 16;

		uint64_t rsvd : 33;
	} s;
};

/* Firmware debug info structure */
struct cn10k_ml_fw_debug {
	/* ACC core 0 debug buffer */
	uint64_t core0_debug_ptr;

	/* ACC core 1 debug buffer */
	uint64_t core1_debug_ptr;

	/* ACC core 0 exception state buffer */
	uint64_t core0_exception_buffer;

	/* ACC core 1 exception state buffer */
	uint64_t core1_exception_buffer;

	/* Debug buffer size per core */
	uint32_t debug_buffer_size;

	/* Exception state dump size */
	uint32_t exception_state_size;
};

/* Job descriptor header (32 bytes) */
struct cn10k_ml_jd_header {
	/* Job completion structure */
	struct ml_jce_s jce;

	/* Model ID */
	uint64_t model_id : 8;

	/* Job type */
	uint64_t job_type : 8;

	/* Flags for fast-path jobs */
	uint64_t fp_flags : 16;

	/* Flags for slow-path jobs */
	uint64_t sp_flags : 16;
	uint64_t rsvd : 16;

	/* Job result pointer */
	uint64_t *result;
};

/* Extra arguments for job descriptor */
union cn10k_ml_jd_extended_args {
	struct cn10k_ml_jd_extended_args_section_start {
		/* DDR Scratch base address */
		uint64_t ddr_scratch_base_address;

		/* DDR Scratch range start */
		uint64_t ddr_scratch_range_start;

		/* DDR Scratch range end */
		uint64_t ddr_scratch_range_end;

		uint8_t rsvd[104];
	} start;
};

/* Job descriptor structure */
struct cn10k_ml_jd {
	/* Job descriptor header (32 bytes) */
	struct cn10k_ml_jd_header hdr;

	union {
		struct cn10k_ml_jd_section_fw_load {
			/* Firmware capability structure (8 bytes) */
			union cn10k_ml_fw_cap cap;

			/* Firmware version (32 bytes) */
			uint8_t version[MLDEV_FIRMWARE_VERSION_LENGTH];

			/* Debug capability structure (40 bytes) */
			struct cn10k_ml_fw_debug debug;

			/* Flags to control error handling */
			uint64_t flags;

			uint8_t rsvd[8];
		} fw_load;

		struct cn10k_ml_jd_section_model_start {
			/* Extended arguments */
			uint64_t extended_args;

			/* Destination model start address in DDR relative to ML_MLR_BASE */
			uint64_t model_dst_ddr_addr;

			/* Offset to model init section in the model */
			uint64_t model_init_offset : 32;

			/* Size of init section in the model */
			uint64_t model_init_size : 32;

			/* Offset to model main section in the model */
			uint64_t model_main_offset : 32;

			/* Size of main section in the model */
			uint64_t model_main_size : 32;

			/* Offset to model finish section in the model */
			uint64_t model_finish_offset : 32;

			/* Size of finish section in the model */
			uint64_t model_finish_size : 32;

			/* Offset to WB in model bin */
			uint64_t model_wb_offset : 32;

			/* Number of model layers */
			uint64_t num_layers : 8;

			/* Number of gather entries, 0 means linear input mode (= no gather) */
			uint64_t num_gather_entries : 8;

			/* Number of scatter entries, 0 means linear output mode (= no scatter) */
			uint64_t num_scatter_entries : 8;

			/* Tile mask to load model */
			uint64_t tilemask : 8;

			/* Batch size of model */
			uint64_t batch_size : 32;

			/* OCM WB base address */
			uint64_t ocm_wb_base_address : 32;

			/* OCM WB range start */
			uint64_t ocm_wb_range_start : 32;

			/* OCM WB range end */
			uint64_t ocm_wb_range_end : 32;

			/* DDR WB base address */
			uint64_t ddr_wb_base_address;

			/* DDR WB range start */
			uint64_t ddr_wb_range_start : 32;

			/* DDR WB range end */
			uint64_t ddr_wb_range_end : 32;

			union {
				/* Points to gather list if num_gather_entries > 0 */
				void *gather_list;
				struct {
					/* Linear input mode */
					uint64_t ddr_range_start : 32;
					uint64_t ddr_range_end : 32;
				} s;
			} input;

			union {
				/* Points to scatter list if num_scatter_entries > 0 */
				void *scatter_list;
				struct {
					/* Linear output mode */
					uint64_t ddr_range_start : 32;
					uint64_t ddr_range_end : 32;
				} s;
			} output;
		} model_start;

		struct cn10k_ml_jd_section_model_stop {
			uint8_t rsvd[96];
		} model_stop;

		struct cn10k_ml_jd_section_model_run {
			/* Address of the input for the run relative to ML_MLR_BASE */
			uint64_t input_ddr_addr;

			/* Address of the output for the run relative to ML_MLR_BASE */
			uint64_t output_ddr_addr;

			/* Number of batches to run in variable batch processing */
			uint16_t num_batches;

			uint8_t rsvd[78];
		} model_run;
	};
} __plt_aligned(ROC_ALIGN);

/* CN10K specific request */
struct cn10k_ml_req {
	/* Job descriptor */
	struct cn10k_ml_jd jd;

	/* Job descriptor extra arguments */
	union cn10k_ml_jd_extended_args extended_args;

	/* Status field for poll mode requests */
	volatile uint64_t status;

	/* Job command */
	struct ml_job_cmd_s jcmd;

	/* Result */
	struct cn10k_ml_result result;
};
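/*
 * Illustrative sketch (not part of the driver API): a poll-mode caller spins
 * on the volatile status word of a request until firmware overwrites it with
 * the completion value. The helper name and the done_value parameter are
 * hypothetical; the driver defines its own polling constants internally.
 */
static inline bool
cn10k_ml_example_req_done(const struct cn10k_ml_req *req, uint64_t done_value)
{
	/* Volatile read: re-fetched from memory on every poll iteration. */
	return req->status == done_value;
}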
/* Device ops */
int cn10k_ml_dev_info_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_dev_info *dev_info);
int cn10k_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_config *conf);
int cn10k_ml_dev_close(struct cnxk_ml_dev *cnxk_mldev);
int cn10k_ml_dev_start(struct cnxk_ml_dev *cnxk_mldev);
int cn10k_ml_dev_stop(struct cnxk_ml_dev *cnxk_mldev);
int cn10k_ml_dev_dump(struct cnxk_ml_dev *cnxk_mldev, FILE *fp);
int cn10k_ml_dev_selftest(struct cnxk_ml_dev *cnxk_mldev);

/* Slow-path ops */
int cn10k_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *params,
			struct cnxk_ml_model *model);
int cn10k_ml_model_unload(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model);
int cn10k_ml_model_start(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model);
int cn10k_ml_model_stop(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model);
int cn10k_ml_model_params_update(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
				 void *buffer);

/* Fast-path ops */
__rte_hot bool cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
				       uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
__rte_hot int cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
				    struct rte_ml_op_error *error);
__rte_hot int cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
				      uint16_t nb_batches);
__rte_hot void cn10k_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
__rte_hot void cn10k_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
__rte_hot void cn10k_ml_set_poll_addr(struct cnxk_ml_req *req);

/* Misc ops */
void cn10k_ml_qp_initialize(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_qp *qp);

/* Layer ops */
int cn10k_ml_layer_load(void *device, uint16_t model_id, const char *layer_name, uint8_t *buffer,
			size_t size, uint16_t *index);
int cn10k_ml_layer_unload(void *device, uint16_t model_id, const char *layer_name);
int cn10k_ml_layer_start(void *device, uint16_t model_id, const char *layer_name);
int cn10k_ml_layer_stop(void *device, uint16_t model_id, const char *layer_name);
int cn10k_ml_io_alloc(void *device, uint16_t model_id, const char *layer_name,
		      uint64_t **input_qbuffer, uint64_t **output_qbuffer);
int cn10k_ml_io_free(void *device, uint16_t model_id, const char *layer_name);
int cn10k_ml_malloc(const char *name, size_t size, uint32_t align, void **addr);
int cn10k_ml_free(const char *name);

/* xstats ops */
void cn10k_ml_xstat_model_name_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
				   uint16_t stat_id, uint16_t entry, char *suffix);
uint64_t cn10k_ml_model_xstat_get(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer,
				  enum cnxk_ml_xstats_type type);

#endif /* _CN10K_ML_OPS_H_ */
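/*
 * Usage sketch (illustrative only): a single-batch synchronous inference
 * through the fast-path helper declared above. The wrapper name is
 * hypothetical; the device pointer and layer index are assumed to come from
 * the driver's probe and cn10k_ml_layer_load() paths, and the buffers are
 * assumed to already hold quantized data.
 */
static inline int
cn10k_ml_example_run_once(void *device, uint16_t index, void *input, void *output)
{
	/* nb_batches = 1: run exactly one batch and wait for completion. */
	return cn10k_ml_inference_sync(device, index, input, output, 1);
}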