/*
 * f-stack/dpdk/drivers/ml/cnxk/cn10k_ml_model.h
 *
 * 468 lines
 * 11 KiB
 * C
 */

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (c) 2022 Marvell.
*/
#ifndef _CN10K_ML_MODEL_H_
#define _CN10K_ML_MODEL_H_
#include <rte_mldev.h>
#include <roc_api.h>
#include "cn10k_ml_ocm.h"
#include "cnxk_ml_io.h"
/* Forward declarations of common cnxk ML types; this header refers to them
 * only through pointers, so their full definitions are not needed here.
 */
struct cnxk_ml_dev;
struct cnxk_ml_layer;
struct cnxk_ml_model;
struct cnxk_ml_req;
/* Model Metadata : v 2.3.0.1 */

/* Magic string stored at the start of the metadata header */
#define MRVL_ML_MODEL_MAGIC_STRING	"MRVL"

/* Target architecture value used by this driver (128 = MLIP) */
#define MRVL_ML_MODEL_TARGET_ARCH	128

/* Minimum model version value (presumably the oldest metadata version accepted - TODO confirm) */
#define MRVL_ML_MODEL_VERSION_MIN	2100

/* Lengths, in bytes, of the name fields stored in the metadata */
#define MRVL_ML_MODEL_NAME_LEN		64
#define MRVL_ML_INPUT_NAME_LEN		16
#define MRVL_ML_OUTPUT_NAME_LEN		16

/* Number of input / output descriptors in the first and second group,
 * followed by the total across both groups.
 */
#define MRVL_ML_NUM_INPUT_OUTPUT_1	8
#define MRVL_ML_NUM_INPUT_OUTPUT_2	24
#define MRVL_ML_NUM_INPUT_OUTPUT	\
	(MRVL_ML_NUM_INPUT_OUTPUT_1 + MRVL_ML_NUM_INPUT_OUTPUT_2)
/* Metadata header (256-byte) */
struct cn10k_ml_model_metadata_header {
	/* Magic string: 'M', 'R', 'V', 'L' */
	uint8_t magic[4];

	/* Version of the metadata */
	uint8_t version[4];

	/* Size of the metadata */
	uint32_t metadata_size;

	/* Unique ID */
	uint8_t uuid[128];

	/* Target architecture of the model:
	 *   0   = Undefined
	 *   1   = M1K
	 *   128 = MLIP
	 *   256 = Experimental
	 */
	uint32_t target_architecture;

	/* Reserved bytes; with the other members they bring the header to 256 bytes */
	uint8_t reserved[104];

	/* CRC of the data that follows the header (i.e. after the first 256 bytes) */
	uint32_t payload_crc32c;

	/* CRC of the first 252 bytes of the header, taken after payload_crc calculation */
	uint32_t header_crc32c;
};
/* Model information block (256-byte) */
struct cn10k_ml_model_metadata_model {
	/* Name of the model, stored as a string */
	uint8_t name[MRVL_ML_MODEL_NAME_LEN];

	/* Version of the model (xx.xx.xx.xx) */
	uint8_t version[4];

	/* Size of the model code (Init + Main + Finish) */
	uint32_t code_size;

	/* Size of the model data (Weights and Bias) */
	uint32_t data_size;

	/* OCM start offset, set to ocm_wb_range_start */
	uint32_t ocm_start;

	/* OCM end offset, set to max OCM size */
	uint32_t ocm_end;

	/* OCM relocatable flag (always yes):
	 *   0 = Not relocatable
	 *   1 = Relocatable
	 */
	uint8_t ocm_relocatable;

	/* Tile relocatable flag (always yes):
	 *   0 = Not relocatable
	 *   1 = Relocatable
	 */
	uint8_t tile_relocatable;

	/* First tile (always 0) */
	uint8_t tile_start;

	/* Last tile (num_tiles - 1) */
	uint8_t tile_end;

	/* Batch size used for inference */
	uint8_t batch_size;

	/* Count of input tensors (max 32) */
	uint8_t num_input;

	/* Count of output tensors (max 32) */
	uint8_t num_output;

	uint8_t reserved_1;

	/* Total size of the inputs, in bytes */
	uint32_t input_size;

	/* Total size of the outputs, in bytes */
	uint32_t output_size;

	/* Size of the table, in bytes */
	uint32_t table_size;

	/* Count of layers in the network */
	uint32_t num_layers;

	uint32_t reserved_2;

	/* Floor of absolute OCM region */
	uint64_t ocm_tmp_range_floor;

	/* WB data block: relative OCM start address */
	uint64_t ocm_wb_range_start;

	/* WB data block: relative OCM end address */
	uint64_t ocm_wb_range_end;

	/* WB data block: relative DDR start address */
	uint64_t ddr_wb_range_start;

	/* Relative DDR end address of all outputs
	 * NOTE(review): the field name suggests the end of the WB data block - confirm.
	 */
	uint64_t ddr_wb_range_end;

	/* All inputs: relative DDR start address */
	uint64_t ddr_input_range_start;

	/* All inputs: relative DDR end address */
	uint64_t ddr_input_range_end;

	/* All outputs: relative DDR start address */
	uint64_t ddr_output_range_start;

	/* All outputs: relative DDR end address */
	uint64_t ddr_output_range_end;

	/* Version of the compiler */
	uint8_t compiler_version[8];

	/* Version of the CDK */
	uint8_t cdk_version[4];

	/* Support for lower batch optimization:
	 *   0 - No
	 *   1 - Yes
	 */
	uint8_t supports_lower_batch_size_optimization;

	uint8_t reserved_3[3];

	/* Scratch space: relative DDR start address */
	uint64_t ddr_scratch_range_start;

	/* Scratch space: relative DDR end address */
	uint64_t ddr_scratch_range_end;

	uint8_t reserved_4[40];
};
/* Init section descriptor (64-byte) */
struct cn10k_ml_model_metadata_init_section {
	/* Offset and size of the section; judging by the names these refer to
	 * the model file - TODO confirm
	 */
	uint32_t file_offset;
	uint32_t file_size;

	/* Reserved bytes; with the two fields above they bring the descriptor to 64 bytes */
	uint8_t reserved[56];
};
/* Main section descriptor (64-byte) */
struct cn10k_ml_model_metadata_main_section {
	/* Offset and size of the section; judging by the names these refer to
	 * the model file - TODO confirm
	 */
	uint32_t file_offset;
	uint32_t file_size;

	/* Reserved bytes; with the two fields above they bring the descriptor to 64 bytes */
	uint8_t reserved[56];
};
/* Finish section descriptor (64-byte) */
struct cn10k_ml_model_metadata_finish_section {
	/* Offset and size of the section; judging by the names these refer to
	 * the model file - TODO confirm
	 */
	uint32_t file_offset;
	uint32_t file_size;

	/* Reserved bytes; with the two fields above they bring the descriptor to 64 bytes */
	uint8_t reserved[56];
};
/* Weights and Bias (WB) section descriptor (64-byte) */
struct cn10k_ml_model_metadata_weights_bias_section {
	/* Offset in memory, set to ddr_wb_range_start */
	uint64_t mem_offset;

	/* Offset and size of the section; judging by the names these refer to
	 * the model file - TODO confirm
	 */
	uint32_t file_offset;
	uint32_t file_size;

	/* WB relocatable flag:
	 *   1 = Relocatable
	 *   2 = Not relocatable
	 */
	uint8_t relocatable;

	/* Reserved bytes; with the fields above they bring the descriptor to 64 bytes */
	uint8_t reserved[47];
};
/* Input descriptor (64-byte, one per input) */
struct cn10k_ml_model_metadata_input_section {
	/* DDR offset (in OCM absolute addresses for input) */
	uint64_t mem_offset;

	/* Relocatable flag:
	 *   1 = Relocatable
	 *   2 = Not relocatable
	 */
	uint8_t relocatable;

	/* Quantization of the input:
	 *   1 = Requires quantization
	 *   2 = Pre-quantized
	 */
	uint8_t quantize;

	/* Data type of the incoming input:
	 *   1 = INT8,  2 = UINT8,  3 = INT16, 4 = UINT16,
	 *   5 = INT32, 6 = UINT32, 7 = FP16,  8 = FP32
	 */
	uint8_t input_type;

	/* Data type of the input as required by the model:
	 *   1 = INT8,  2 = UINT8,  3 = INT16, 4 = UINT16,
	 *   5 = INT32, 6 = UINT32, 7 = FP16,  8 = FP32
	 */
	uint8_t model_input_type;

	/* float_32 qscale value, where
	 *   quantized = non-quantized * qscale
	 */
	float qscale;

	/* Shape of the input */
	struct {
		/* Format of the input:
		 *   1 = NCHW
		 *   2 = NHWC
		 */
		uint8_t format;
		uint8_t reserved[3];
		uint32_t w;
		uint32_t x;
		uint32_t y;
		uint32_t z;
	} shape;

	uint8_t reserved[4];

	/* Input name */
	uint8_t input_name[MRVL_ML_INPUT_NAME_LEN];

	/* End of the DDR range:
	 *   new = mem_offset + size_bytes - 1
	 */
	uint64_t ddr_range_end;
};
/* Output descriptor (64-byte, one per output) */
struct cn10k_ml_model_metadata_output_section {
	/* DDR offset in OCM absolute addresses for output */
	uint64_t mem_offset;

	/* Relocatable flag:
	 *   1 = Relocatable
	 *   2 = Not relocatable
	 */
	uint8_t relocatable;

	/* De-quantization of the output:
	 *   1 = De-quantization required
	 *   2 = De-quantization not required
	 */
	uint8_t dequantize;

	/* Data type of the outgoing output:
	 *   1 = INT8,  2 = UINT8,  3 = INT16, 4 = UINT16,
	 *   5 = INT32, 6 = UINT32, 7 = FP16,  8 = FP32
	 */
	uint8_t output_type;

	/* Data type of the output as produced by the model:
	 *   1 = INT8,  2 = UINT8,  3 = INT16, 4 = UINT16,
	 *   5 = INT32, 6 = UINT32, 7 = FP16,  8 = FP32
	 */
	uint8_t model_output_type;

	/* float_32 dscale value, where
	 *   dequantized = quantized * dscale
	 */
	float dscale;

	/* Count of items in the output */
	uint32_t size;

	uint8_t reserved[20];

	/* End of the DDR range:
	 *   new = mem_offset + size_bytes - 1
	 */
	uint64_t ddr_range_end;

	/* Output name (the original carries no comment on this field) */
	uint8_t output_name[MRVL_ML_OUTPUT_NAME_LEN];
};
/* Model data section */
struct cn10k_ml_model_metadata_data_section {
	uint8_t reserved[996];

	/* Compiler version, formatted as
	 *   Beta:  xx.xx.xx.xx
	 *   Later: YYYYMM.xx.xx
	 */
	uint8_t compiler_version[8];

	/* Version of the M1K CDK (xx.xx.xx.xx) */
	uint8_t m1k_cdk_version[4];
};
/* Metadata structure of a model file */
struct cn10k_ml_model_metadata {
	/* Header (256-byte) */
	struct cn10k_ml_model_metadata_header header;

	/* Model information (256-byte) */
	struct cn10k_ml_model_metadata_model model;

	/* Code sections, 64-byte each: Init, Main and Finish */
	struct cn10k_ml_model_metadata_init_section init_model;
	struct cn10k_ml_model_metadata_main_section main_model;
	struct cn10k_ml_model_metadata_finish_section finish_model;

	/* End of 2k bytes
	 * NOTE(review): with the sizes documented here (256 + 256 + 3 * 64 + 512)
	 * the running offset after this field is 1216 bytes - confirm the intent
	 * of the "2k" remark.
	 */
	uint8_t reserved_1[512];

	/* Weights and Bias (64-byte) */
	struct cn10k_ml_model_metadata_weights_bias_section weights_bias;

	/* First group of inputs: 512 bytes, 64-byte per input, provisioned for 8 inputs */
	struct cn10k_ml_model_metadata_input_section input1[MRVL_ML_NUM_INPUT_OUTPUT_1];

	/* First group of outputs: 512 bytes, 64-byte per output, provisioned for 8 outputs */
	struct cn10k_ml_model_metadata_output_section output1[MRVL_ML_NUM_INPUT_OUTPUT_1];

	uint8_t reserved_2[1792];

	/* Second group of inputs: 1536 bytes, 64-byte per input, provisioned for 24 inputs */
	struct cn10k_ml_model_metadata_input_section input2[MRVL_ML_NUM_INPUT_OUTPUT_2];

	/* Second group of outputs: 1536 bytes, 64-byte per output, provisioned for 24 outputs */
	struct cn10k_ml_model_metadata_output_section output2[MRVL_ML_NUM_INPUT_OUTPUT_2];

	/* Model data */
	struct cn10k_ml_model_metadata_data_section data;

	/* Hidden 16 bytes of magic code */
	uint8_t reserved_3[16];
};
/* Address structure of a model */
struct cn10k_ml_layer_addr {
	/* Load: base DMA address */
	void *base_dma_addr_load;

	/* Load addresses of the Init, Main and Finish sections */
	void *init_load_addr;
	void *main_load_addr;
	void *finish_load_addr;

	/* Weights and Bias: base address */
	void *wb_base_addr;

	/* Weights and Bias: load address */
	void *wb_load_addr;

	/* Scratch: base address */
	void *scratch_base_addr;

	/* First tile */
	uint8_t tile_start;

	/* Last tile */
	uint8_t tile_end;
};
/* Fast-path statistics of a model */
struct cn10k_ml_layer_xstats {
	/* Hardware latency: total (sum of all inferences) */
	uint64_t hw_latency_tot;

	/* Hardware latency: minimum */
	uint64_t hw_latency_min;

	/* Hardware latency: maximum */
	uint64_t hw_latency_max;

	/* Firmware latency: total (sum of all inferences) */
	uint64_t fw_latency_tot;

	/* Firmware latency: minimum */
	uint64_t fw_latency_min;

	/* Firmware latency: maximum */
	uint64_t fw_latency_max;

	/* Total number of jobs dequeued */
	uint64_t dequeued_count;

	/* Reset index of the hardware stats */
	uint64_t hw_reset_count;

	/* Reset index of the firmware stats */
	uint64_t fw_reset_count;
};
/* cn10k data kept for a layer */
struct cn10k_ml_layer_data {
	/* Metadata (model / layer), embedded by value */
	struct cn10k_ml_model_metadata metadata;

	/* Address structure of the layer */
	struct cn10k_ml_layer_addr addr;

	/* Tile and memory information object of the layer */
	struct cn10k_ml_ocm_layer_map ocm_map;

	/* Request pointer used for the layer's slow-path operations */
	struct cnxk_ml_req *req;

	/* Layer stats for burst ops */
	struct cn10k_ml_layer_xstats *burst_xstats;

	/* Layer stats for sync ops */
	struct cn10k_ml_layer_xstats *sync_xstats;
};
/* cn10k data kept for a model */
struct cn10k_ml_model_data {
	/* Metadata (model / layer), embedded by value */
	struct cn10k_ml_model_metadata metadata;
};
/*
 * cn10k model / layer helper routines.
 *
 * Only the prototypes are visible in this header. The one-line notes below are
 * derived from the function names and parameter types and should be confirmed
 * against the implementation.
 */

/* Metadata check on a model buffer of the given size; returns an int status
 * (presumably 0 on success - TODO confirm).
 */
int cn10k_ml_model_metadata_check(uint8_t *buffer, uint64_t size);

/* Metadata update; takes a non-const pointer, so presumably modifies it in place - TODO confirm */
void cn10k_ml_model_metadata_update(struct cn10k_ml_model_metadata *metadata);

/* Layer address update from a model buffer and a base DMA address */
void cn10k_ml_layer_addr_update(struct cnxk_ml_layer *layer,
				uint8_t *buffer,
				uint8_t *base_dma_addr);

/* Layer I/O info setup from the metadata (presumably fills io_info - TODO confirm) */
void cn10k_ml_layer_io_info_set(struct cnxk_ml_io_info *io_info,
				struct cn10k_ml_model_metadata *metadata);

/* I/O info lookup for the layer identified by layer_id within a model */
struct cnxk_ml_io_info *cn10k_ml_model_io_info_get(struct cnxk_ml_model *model,
						   uint16_t layer_id);

/* OCM page count for a layer; wb_pages and scratch_pages are presumably
 * output parameters - TODO confirm. Returns an int status.
 */
int cn10k_ml_model_ocm_pages_count(struct cnxk_ml_dev *cnxk_mldev,
				   struct cnxk_ml_layer *layer,
				   uint8_t *buffer,
				   uint16_t *wb_pages,
				   uint16_t *scratch_pages);

/* Model info setup from the I/O info and metadata */
void cn10k_ml_model_info_set(struct cnxk_ml_dev *cnxk_mldev,
			     struct cnxk_ml_model *model,
			     struct cnxk_ml_io_info *io_info,
			     struct cn10k_ml_model_metadata *metadata);

/* Layer print routine taking a FILE stream fp (presumably the output destination - TODO confirm) */
void cn10k_ml_layer_print(struct cnxk_ml_dev *cnxk_mldev,
			  struct cnxk_ml_layer *layer,
			  FILE *fp);

/* Layer id lookup by layer name; layer_id is presumably an output
 * parameter - TODO confirm. Returns an int status.
 */
int cn10k_ml_model_get_layer_id(struct cnxk_ml_model *model,
				const char *layer_name,
				uint16_t *layer_id);
#endif /* _CN10K_ML_MODEL_H_ */