/* SPDX-License-Identifier: BSD-3-Clause * Copyright (c) 2022 Marvell. */ #ifndef _CN10K_ML_MODEL_H_ #define _CN10K_ML_MODEL_H_ #include #include #include "cn10k_ml_ocm.h" #include "cnxk_ml_io.h" struct cnxk_ml_dev; struct cnxk_ml_model; struct cnxk_ml_layer; struct cnxk_ml_req; /* Model Metadata : v 2.3.0.1 */ #define MRVL_ML_MODEL_MAGIC_STRING "MRVL" #define MRVL_ML_MODEL_TARGET_ARCH 128 #define MRVL_ML_MODEL_VERSION_MIN 2100 #define MRVL_ML_MODEL_NAME_LEN 64 #define MRVL_ML_INPUT_NAME_LEN 16 #define MRVL_ML_OUTPUT_NAME_LEN 16 #define MRVL_ML_NUM_INPUT_OUTPUT_1 8 #define MRVL_ML_NUM_INPUT_OUTPUT_2 24 #define MRVL_ML_NUM_INPUT_OUTPUT (MRVL_ML_NUM_INPUT_OUTPUT_1 + MRVL_ML_NUM_INPUT_OUTPUT_2) /* Header (256-byte) */ struct cn10k_ml_model_metadata_header { /* Magic string ('M', 'R', 'V', 'L') */ uint8_t magic[4]; /* Metadata version */ uint8_t version[4]; /* Metadata size */ uint32_t metadata_size; /* Unique ID */ uint8_t uuid[128]; /* Model target architecture * 0 = Undefined * 1 = M1K * 128 = MLIP * 256 = Experimental */ uint32_t target_architecture; uint8_t reserved[104]; /* CRC of data after header (i.e. after first 256 bytes) */ uint32_t payload_crc32c; /* CRC of first 252 bytes of header, after payload_crc calculation */ uint32_t header_crc32c; }; /* Model information (256-byte) */ struct cn10k_ml_model_metadata_model { /* Model name string */ uint8_t name[MRVL_ML_MODEL_NAME_LEN]; /* Model version info (xx.xx.xx.xx) */ uint8_t version[4]; /* Model code size (Init + Main + Finish) */ uint32_t code_size; /* Model data size (Weights and Bias) */ uint32_t data_size; /* OCM start offset, set to ocm_wb_range_start */ uint32_t ocm_start; /* OCM start offset, set to max OCM size */ uint32_t ocm_end; /* Relocatable flag (always yes) * 0 = Not relocatable * 1 = Relocatable */ uint8_t ocm_relocatable; /* Tile relocatable flag (always yes) * 0 = Not relocatable * 1 = Relocatable */ uint8_t tile_relocatable; /* Start tile (Always 0) */ uint8_t tile_start; /* End tile (num_tiles - 1) */ uint8_t tile_end; /* Inference batch size */ uint8_t batch_size; /* Number of input tensors (Max 32) */ uint8_t num_input; /* Number of output tensors (Max 32) */ uint8_t num_output; uint8_t reserved_1; /* Total input size in bytes */ uint32_t input_size; /* Total output size in bytes */ uint32_t output_size; /* Table size in bytes */ uint32_t table_size; /* Number of layers in the network */ uint32_t num_layers; uint32_t reserved_2; /* Floor of absolute OCM region */ uint64_t ocm_tmp_range_floor; /* Relative OCM start address of WB data block */ uint64_t ocm_wb_range_start; /* Relative OCM end address of WB data block */ uint64_t ocm_wb_range_end; /* Relative DDR start address of WB data block */ uint64_t ddr_wb_range_start; /* Relative DDR end address of all outputs */ uint64_t ddr_wb_range_end; /* Relative DDR start address of all inputs */ uint64_t ddr_input_range_start; /* Relative DDR end address of all inputs */ uint64_t ddr_input_range_end; /* Relative DDR start address of all outputs */ uint64_t ddr_output_range_start; /* Relative DDR end address of all outputs */ uint64_t ddr_output_range_end; /* Compiler version */ uint8_t compiler_version[8]; /* CDK version */ uint8_t cdk_version[4]; /* Lower batch optimization support * 0 - No, * 1 - Yes */ uint8_t supports_lower_batch_size_optimization; uint8_t reserved_3[3]; /* Relative DDR start address of scratch space */ uint64_t ddr_scratch_range_start; /* Relative DDR end address of scratch space */ uint64_t ddr_scratch_range_end; uint8_t reserved_4[40]; }; /* Init section (64-byte) */ struct cn10k_ml_model_metadata_init_section { uint32_t file_offset; uint32_t file_size; uint8_t reserved[56]; }; /* Main section (64-byte) */ struct cn10k_ml_model_metadata_main_section { uint32_t file_offset; uint32_t file_size; uint8_t reserved[56]; }; /* Finish section (64-byte) */ struct cn10k_ml_model_metadata_finish_section { uint32_t file_offset; uint32_t file_size; uint8_t reserved[56]; }; /* Weights and Bias (64-byte) */ struct cn10k_ml_model_metadata_weights_bias_section { /* Memory offset, set to ddr_wb_range_start */ uint64_t mem_offset; uint32_t file_offset; uint32_t file_size; /* Relocatable flag for WB * 1 = Relocatable * 2 = Not relocatable */ uint8_t relocatable; uint8_t reserved[47]; }; /* Input section (64-byte per input) */ struct cn10k_ml_model_metadata_input_section { /* DDR offset (in OCM absolute addresses for input) */ uint64_t mem_offset; /* Relocatable flag * 1 = Relocatable * 2 = Not relocatable */ uint8_t relocatable; /* Input quantization * 1 = Requires quantization * 2 = Pre-quantized */ uint8_t quantize; /* Type of incoming input * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16, * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 */ uint8_t input_type; /* Type of input required by model * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16, * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 */ uint8_t model_input_type; /* float_32 qscale value * quantized = non-quantized * qscale */ float qscale; /* Input shape */ struct { /* Input format * 1 = NCHW * 2 = NHWC */ uint8_t format; uint8_t reserved[3]; uint32_t w; uint32_t x; uint32_t y; uint32_t z; } shape; uint8_t reserved[4]; /* Name of input */ uint8_t input_name[MRVL_ML_INPUT_NAME_LEN]; /* DDR range end * new = mem_offset + size_bytes - 1 */ uint64_t ddr_range_end; }; /* Output section (64-byte per output) */ struct cn10k_ml_model_metadata_output_section { /* DDR offset in OCM absolute addresses for output */ uint64_t mem_offset; /* Relocatable flag * 1 = Relocatable * 2 = Not relocatable */ uint8_t relocatable; /* Output dequantization * 1 = De-quantization required * 2 = De-quantization not required */ uint8_t dequantize; /* Type of outgoing output * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 */ uint8_t output_type; /* Type of output produced by model * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 */ uint8_t model_output_type; /* float_32 dscale value * dequantized = quantized * dscale */ float dscale; /* Number of items in the output */ uint32_t size; uint8_t reserved[20]; /* DDR range end * new = mem_offset + size_bytes - 1 */ uint64_t ddr_range_end; uint8_t output_name[MRVL_ML_OUTPUT_NAME_LEN]; }; /* Model data */ struct cn10k_ml_model_metadata_data_section { uint8_t reserved[996]; /* Beta: xx.xx.xx.xx, * Later: YYYYMM.xx.xx */ uint8_t compiler_version[8]; /* M1K CDK version (xx.xx.xx.xx) */ uint8_t m1k_cdk_version[4]; }; /* Model file metadata structure */ struct cn10k_ml_model_metadata { /* Header (256-byte) */ struct cn10k_ml_model_metadata_header header; /* Model information (256-byte) */ struct cn10k_ml_model_metadata_model model; /* Init section (64-byte) */ struct cn10k_ml_model_metadata_init_section init_model; /* Main section (64-byte) */ struct cn10k_ml_model_metadata_main_section main_model; /* Finish section (64-byte) */ struct cn10k_ml_model_metadata_finish_section finish_model; uint8_t reserved_1[512]; /* End of 2k bytes */ /* Weights and Bias (64-byte) */ struct cn10k_ml_model_metadata_weights_bias_section weights_bias; /* Input (512-bytes, 64-byte per input) provisioned for 8 inputs */ struct cn10k_ml_model_metadata_input_section input1[MRVL_ML_NUM_INPUT_OUTPUT_1]; /* Output (512-bytes, 64-byte per output) provisioned for 8 outputs */ struct cn10k_ml_model_metadata_output_section output1[MRVL_ML_NUM_INPUT_OUTPUT_1]; uint8_t reserved_2[1792]; /* Input (1536-bytes, 64-byte per input) provisioned for 24 inputs */ struct cn10k_ml_model_metadata_input_section input2[MRVL_ML_NUM_INPUT_OUTPUT_2]; /* Output (1536-bytes, 64-byte per output) provisioned for 24 outputs */ struct cn10k_ml_model_metadata_output_section output2[MRVL_ML_NUM_INPUT_OUTPUT_2]; /* Model data */ struct cn10k_ml_model_metadata_data_section data; /* Hidden 16 bytes of magic code */ uint8_t reserved_3[16]; }; /* Model address structure */ struct cn10k_ml_layer_addr { /* Base DMA address for load */ void *base_dma_addr_load; /* Init section load address */ void *init_load_addr; /* Main section load address */ void *main_load_addr; /* Finish section load address */ void *finish_load_addr; /* Weights and Bias base address */ void *wb_base_addr; /* Weights and bias load address */ void *wb_load_addr; /* Scratch base address */ void *scratch_base_addr; /* Start tile */ uint8_t tile_start; /* End tile */ uint8_t tile_end; }; /* Model fast-path stats */ struct cn10k_ml_layer_xstats { /* Total hardware latency, sum of all inferences */ uint64_t hw_latency_tot; /* Minimum hardware latency */ uint64_t hw_latency_min; /* Maximum hardware latency */ uint64_t hw_latency_max; /* Total firmware latency, sum of all inferences */ uint64_t fw_latency_tot; /* Minimum firmware latency */ uint64_t fw_latency_min; /* Maximum firmware latency */ uint64_t fw_latency_max; /* Total jobs dequeued */ uint64_t dequeued_count; /* Hardware stats reset index */ uint64_t hw_reset_count; /* Firmware stats reset index */ uint64_t fw_reset_count; }; struct cn10k_ml_layer_data { /* Model / Layer: metadata */ struct cn10k_ml_model_metadata metadata; /* Layer: address structure */ struct cn10k_ml_layer_addr addr; /* Layer: Tile and memory information object */ struct cn10k_ml_ocm_layer_map ocm_map; /* Layer: Slow-path operations request pointer */ struct cnxk_ml_req *req; /* Layer: Stats for burst ops */ struct cn10k_ml_layer_xstats *burst_xstats; /* Layer: Stats for sync ops */ struct cn10k_ml_layer_xstats *sync_xstats; }; struct cn10k_ml_model_data { /* Model / Layer: metadata */ struct cn10k_ml_model_metadata metadata; }; int cn10k_ml_model_metadata_check(uint8_t *buffer, uint64_t size); void cn10k_ml_model_metadata_update(struct cn10k_ml_model_metadata *metadata); void cn10k_ml_layer_addr_update(struct cnxk_ml_layer *layer, uint8_t *buffer, uint8_t *base_dma_addr); void cn10k_ml_layer_io_info_set(struct cnxk_ml_io_info *io_info, struct cn10k_ml_model_metadata *metadata); struct cnxk_ml_io_info *cn10k_ml_model_io_info_get(struct cnxk_ml_model *model, uint16_t layer_id); int cn10k_ml_model_ocm_pages_count(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer, uint8_t *buffer, uint16_t *wb_pages, uint16_t *scratch_pages); void cn10k_ml_model_info_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model, struct cnxk_ml_io_info *io_info, struct cn10k_ml_model_metadata *metadata); void cn10k_ml_layer_print(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer, FILE *fp); int cn10k_ml_model_get_layer_id(struct cnxk_ml_model *model, const char *layer_name, uint16_t *layer_id); #endif /* _CN10K_ML_MODEL_H_ */