/* Mirror of https://github.com/F-Stack/f-stack.git (468 lines, 11 KiB, C) */
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */
|
|
|
|
#ifndef _CN10K_ML_MODEL_H_
|
|
#define _CN10K_ML_MODEL_H_
|
|
|
|
#include <rte_mldev.h>
|
|
|
|
#include <roc_api.h>
|
|
|
|
#include "cn10k_ml_ocm.h"
|
|
|
|
#include "cnxk_ml_io.h"
|
|
|
|
/* Forward declarations: these types are only used through pointers in this
 * header, so their full definitions are not needed here.
 */
struct cnxk_ml_dev;
struct cnxk_ml_model;
struct cnxk_ml_layer;
struct cnxk_ml_req;
|
|
|
|
/* Model Metadata : v 2.3.0.1 */

/* Expected contents of the 4-byte magic field of the metadata header */
#define MRVL_ML_MODEL_MAGIC_STRING "MRVL"

/* Target architecture value (128 = MLIP, per the header's target list) */
#define MRVL_ML_MODEL_TARGET_ARCH 128

/* Minimum metadata version.
 * NOTE(review): presumably an integer encoding of version 2.1.0.0 - confirm
 * against the code that compares it.
 */
#define MRVL_ML_MODEL_VERSION_MIN 2100

/* Lengths, in bytes, of the fixed-size name fields */
#define MRVL_ML_MODEL_NAME_LEN 64
#define MRVL_ML_INPUT_NAME_LEN 16
#define MRVL_ML_OUTPUT_NAME_LEN 16

/* Input / output descriptor slots: 8 in the first block, 24 in the second,
 * 32 in total.
 */
#define MRVL_ML_NUM_INPUT_OUTPUT_1 8
#define MRVL_ML_NUM_INPUT_OUTPUT_2 24
#define MRVL_ML_NUM_INPUT_OUTPUT (MRVL_ML_NUM_INPUT_OUTPUT_1 + MRVL_ML_NUM_INPUT_OUTPUT_2)
|
|
|
|
/* Header (256-byte)
 *
 * First section of the model metadata. The members add up to exactly
 * 256 bytes; every member is either a byte array or a 4-byte integer placed
 * on a 4-byte boundary, so no implicit padding is expected.
 */
struct cn10k_ml_model_metadata_header {
	/* Magic string ('M', 'R', 'V', 'L') */
	uint8_t magic[4];

	/* Metadata version */
	uint8_t version[4];

	/* Metadata size */
	uint32_t metadata_size;

	/* Unique ID */
	uint8_t uuid[128];

	/* Model target architecture
	 * 0 = Undefined
	 * 1 = M1K
	 * 128 = MLIP
	 * 256 = Experimental
	 */
	uint32_t target_architecture;

	/* Reserved; pads the header so that the two CRC words are its last
	 * 8 bytes
	 */
	uint8_t reserved[104];

	/* CRC of data after header (i.e. after first 256 bytes) */
	uint32_t payload_crc32c;

	/* CRC of first 252 bytes of header, after payload_crc calculation */
	uint32_t header_crc32c;
};
|
|
|
|
/* Model information (256-byte) */
|
|
struct cn10k_ml_model_metadata_model {
|
|
/* Model name string */
|
|
uint8_t name[MRVL_ML_MODEL_NAME_LEN];
|
|
|
|
/* Model version info (xx.xx.xx.xx) */
|
|
uint8_t version[4];
|
|
|
|
/* Model code size (Init + Main + Finish) */
|
|
uint32_t code_size;
|
|
|
|
/* Model data size (Weights and Bias) */
|
|
uint32_t data_size;
|
|
|
|
/* OCM start offset, set to ocm_wb_range_start */
|
|
uint32_t ocm_start;
|
|
|
|
/* OCM start offset, set to max OCM size */
|
|
uint32_t ocm_end;
|
|
|
|
/* Relocatable flag (always yes)
|
|
* 0 = Not relocatable
|
|
* 1 = Relocatable
|
|
*/
|
|
uint8_t ocm_relocatable;
|
|
|
|
/* Tile relocatable flag (always yes)
|
|
* 0 = Not relocatable
|
|
* 1 = Relocatable
|
|
*/
|
|
uint8_t tile_relocatable;
|
|
|
|
/* Start tile (Always 0) */
|
|
uint8_t tile_start;
|
|
|
|
/* End tile (num_tiles - 1) */
|
|
uint8_t tile_end;
|
|
|
|
/* Inference batch size */
|
|
uint8_t batch_size;
|
|
|
|
/* Number of input tensors (Max 32) */
|
|
uint8_t num_input;
|
|
|
|
/* Number of output tensors (Max 32) */
|
|
uint8_t num_output;
|
|
uint8_t reserved_1;
|
|
|
|
/* Total input size in bytes */
|
|
uint32_t input_size;
|
|
|
|
/* Total output size in bytes */
|
|
uint32_t output_size;
|
|
|
|
/* Table size in bytes */
|
|
uint32_t table_size;
|
|
|
|
/* Number of layers in the network */
|
|
uint32_t num_layers;
|
|
uint32_t reserved_2;
|
|
|
|
/* Floor of absolute OCM region */
|
|
uint64_t ocm_tmp_range_floor;
|
|
|
|
/* Relative OCM start address of WB data block */
|
|
uint64_t ocm_wb_range_start;
|
|
|
|
/* Relative OCM end address of WB data block */
|
|
uint64_t ocm_wb_range_end;
|
|
|
|
/* Relative DDR start address of WB data block */
|
|
uint64_t ddr_wb_range_start;
|
|
|
|
/* Relative DDR end address of all outputs */
|
|
uint64_t ddr_wb_range_end;
|
|
|
|
/* Relative DDR start address of all inputs */
|
|
uint64_t ddr_input_range_start;
|
|
|
|
/* Relative DDR end address of all inputs */
|
|
uint64_t ddr_input_range_end;
|
|
|
|
/* Relative DDR start address of all outputs */
|
|
uint64_t ddr_output_range_start;
|
|
|
|
/* Relative DDR end address of all outputs */
|
|
uint64_t ddr_output_range_end;
|
|
|
|
/* Compiler version */
|
|
uint8_t compiler_version[8];
|
|
|
|
/* CDK version */
|
|
uint8_t cdk_version[4];
|
|
|
|
/* Lower batch optimization support
|
|
* 0 - No,
|
|
* 1 - Yes
|
|
*/
|
|
uint8_t supports_lower_batch_size_optimization;
|
|
uint8_t reserved_3[3];
|
|
|
|
/* Relative DDR start address of scratch space */
|
|
uint64_t ddr_scratch_range_start;
|
|
|
|
/* Relative DDR end address of scratch space */
|
|
uint64_t ddr_scratch_range_end;
|
|
uint8_t reserved_4[40];
|
|
};
|
|
|
|
/* Init section (64-byte)
 *
 * Locates the init code section: two 32-bit fields followed by reserved
 * bytes that pad the descriptor to 64 bytes.
 */
struct cn10k_ml_model_metadata_init_section {
	/* Offset of the section - presumably in bytes from the start of the
	 * model file; TODO confirm
	 */
	uint32_t file_offset;

	/* Size of the section - presumably in bytes; TODO confirm */
	uint32_t file_size;

	/* Reserved; pads the descriptor to 64 bytes */
	uint8_t reserved[56];
};
|
|
|
|
/* Main section (64-byte)
 *
 * Locates the main code section: two 32-bit fields followed by reserved
 * bytes that pad the descriptor to 64 bytes.
 */
struct cn10k_ml_model_metadata_main_section {
	/* Offset of the section - presumably in bytes from the start of the
	 * model file; TODO confirm
	 */
	uint32_t file_offset;

	/* Size of the section - presumably in bytes; TODO confirm */
	uint32_t file_size;

	/* Reserved; pads the descriptor to 64 bytes */
	uint8_t reserved[56];
};
|
|
|
|
/* Finish section (64-byte)
 *
 * Locates the finish code section: two 32-bit fields followed by reserved
 * bytes that pad the descriptor to 64 bytes.
 */
struct cn10k_ml_model_metadata_finish_section {
	/* Offset of the section - presumably in bytes from the start of the
	 * model file; TODO confirm
	 */
	uint32_t file_offset;

	/* Size of the section - presumably in bytes; TODO confirm */
	uint32_t file_size;

	/* Reserved; pads the descriptor to 64 bytes */
	uint8_t reserved[56];
};
|
|
|
|
/* Weights and Bias (64-byte)
 *
 * Locates the weights-and-bias (WB) block both in memory and in the model
 * file. Note that the relocatable flag here uses 1 / 2, unlike the 0 / 1
 * flags in the model information section.
 */
struct cn10k_ml_model_metadata_weights_bias_section {
	/* Memory offset, set to ddr_wb_range_start */
	uint64_t mem_offset;

	/* Offset of the WB block - presumably in bytes from the start of the
	 * model file; TODO confirm
	 */
	uint32_t file_offset;

	/* Size of the WB block - presumably in bytes; TODO confirm */
	uint32_t file_size;

	/* Relocatable flag for WB
	 * 1 = Relocatable
	 * 2 = Not relocatable
	 */
	uint8_t relocatable;

	/* Reserved; pads the descriptor to 64 bytes */
	uint8_t reserved[47];
};
|
|
|
|
/* Input section (64-byte per input) */
|
|
struct cn10k_ml_model_metadata_input_section {
|
|
/* DDR offset (in OCM absolute addresses for input) */
|
|
uint64_t mem_offset;
|
|
|
|
/* Relocatable flag
|
|
* 1 = Relocatable
|
|
* 2 = Not relocatable
|
|
*/
|
|
uint8_t relocatable;
|
|
|
|
/* Input quantization
|
|
* 1 = Requires quantization
|
|
* 2 = Pre-quantized
|
|
*/
|
|
uint8_t quantize;
|
|
|
|
/* Type of incoming input
|
|
* 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16,
|
|
* 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
|
|
*/
|
|
uint8_t input_type;
|
|
|
|
/* Type of input required by model
|
|
* 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16,
|
|
* 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
|
|
*/
|
|
uint8_t model_input_type;
|
|
|
|
/* float_32 qscale value
|
|
* quantized = non-quantized * qscale
|
|
*/
|
|
float qscale;
|
|
|
|
/* Input shape */
|
|
struct {
|
|
/* Input format
|
|
* 1 = NCHW
|
|
* 2 = NHWC
|
|
*/
|
|
uint8_t format;
|
|
uint8_t reserved[3];
|
|
uint32_t w;
|
|
uint32_t x;
|
|
uint32_t y;
|
|
uint32_t z;
|
|
} shape;
|
|
uint8_t reserved[4];
|
|
|
|
/* Name of input */
|
|
uint8_t input_name[MRVL_ML_INPUT_NAME_LEN];
|
|
|
|
/* DDR range end
|
|
* new = mem_offset + size_bytes - 1
|
|
*/
|
|
uint64_t ddr_range_end;
|
|
};
|
|
|
|
/* Output section (64-byte per output) */
|
|
struct cn10k_ml_model_metadata_output_section {
|
|
/* DDR offset in OCM absolute addresses for output */
|
|
uint64_t mem_offset;
|
|
|
|
/* Relocatable flag
|
|
* 1 = Relocatable
|
|
* 2 = Not relocatable
|
|
*/
|
|
uint8_t relocatable;
|
|
|
|
/* Output dequantization
|
|
* 1 = De-quantization required
|
|
* 2 = De-quantization not required
|
|
*/
|
|
uint8_t dequantize;
|
|
|
|
/* Type of outgoing output
|
|
* 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16
|
|
* 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
|
|
*/
|
|
uint8_t output_type;
|
|
|
|
/* Type of output produced by model
|
|
* 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16
|
|
* 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
|
|
*/
|
|
uint8_t model_output_type;
|
|
|
|
/* float_32 dscale value
|
|
* dequantized = quantized * dscale
|
|
*/
|
|
float dscale;
|
|
|
|
/* Number of items in the output */
|
|
uint32_t size;
|
|
uint8_t reserved[20];
|
|
|
|
/* DDR range end
|
|
* new = mem_offset + size_bytes - 1
|
|
*/
|
|
uint64_t ddr_range_end;
|
|
uint8_t output_name[MRVL_ML_OUTPUT_NAME_LEN];
|
|
};
|
|
|
|
/* Model data
 *
 * Trailing data section of the metadata: a reserved area followed by two
 * version fields. All members are byte arrays, 1008 bytes in total.
 */
struct cn10k_ml_model_metadata_data_section {
	/* Reserved */
	uint8_t reserved[996];

	/* Beta: xx.xx.xx.xx,
	 * Later: YYYYMM.xx.xx
	 */
	uint8_t compiler_version[8];

	/* M1K CDK version (xx.xx.xx.xx) */
	uint8_t m1k_cdk_version[4];
};
|
|
|
|
/* Model file metadata structure */
|
|
struct cn10k_ml_model_metadata {
|
|
/* Header (256-byte) */
|
|
struct cn10k_ml_model_metadata_header header;
|
|
|
|
/* Model information (256-byte) */
|
|
struct cn10k_ml_model_metadata_model model;
|
|
|
|
/* Init section (64-byte) */
|
|
struct cn10k_ml_model_metadata_init_section init_model;
|
|
|
|
/* Main section (64-byte) */
|
|
struct cn10k_ml_model_metadata_main_section main_model;
|
|
|
|
/* Finish section (64-byte) */
|
|
struct cn10k_ml_model_metadata_finish_section finish_model;
|
|
|
|
uint8_t reserved_1[512]; /* End of 2k bytes */
|
|
|
|
/* Weights and Bias (64-byte) */
|
|
struct cn10k_ml_model_metadata_weights_bias_section weights_bias;
|
|
|
|
/* Input (512-bytes, 64-byte per input) provisioned for 8 inputs */
|
|
struct cn10k_ml_model_metadata_input_section input1[MRVL_ML_NUM_INPUT_OUTPUT_1];
|
|
|
|
/* Output (512-bytes, 64-byte per output) provisioned for 8 outputs */
|
|
struct cn10k_ml_model_metadata_output_section output1[MRVL_ML_NUM_INPUT_OUTPUT_1];
|
|
|
|
uint8_t reserved_2[1792];
|
|
|
|
/* Input (1536-bytes, 64-byte per input) provisioned for 24 inputs */
|
|
struct cn10k_ml_model_metadata_input_section input2[MRVL_ML_NUM_INPUT_OUTPUT_2];
|
|
|
|
/* Output (1536-bytes, 64-byte per output) provisioned for 24 outputs */
|
|
struct cn10k_ml_model_metadata_output_section output2[MRVL_ML_NUM_INPUT_OUTPUT_2];
|
|
|
|
/* Model data */
|
|
struct cn10k_ml_model_metadata_data_section data;
|
|
|
|
/* Hidden 16 bytes of magic code */
|
|
uint8_t reserved_3[16];
|
|
};
|
|
|
|
/* Layer address structure
 *
 * Addresses used for a layer's code sections, weights-and-bias block and
 * scratch area, plus the tile range it occupies.
 * NOTE(review): who owns / frees the pointed-to memory is not visible in
 * this header.
 */
struct cn10k_ml_layer_addr {
	/* Base DMA address for load */
	void *base_dma_addr_load;

	/* Init section load address */
	void *init_load_addr;

	/* Main section load address */
	void *main_load_addr;

	/* Finish section load address */
	void *finish_load_addr;

	/* Weights and Bias base address */
	void *wb_base_addr;

	/* Weights and bias load address */
	void *wb_load_addr;

	/* Scratch base address */
	void *scratch_base_addr;

	/* Start tile */
	uint8_t tile_start;

	/* End tile */
	uint8_t tile_end;
};
|
|
|
|
/* Layer fast-path stats
 *
 * Hardware and firmware latency accumulators (total / min / max), the count
 * of dequeued jobs, and reset markers for the hardware and firmware stats.
 * NOTE(review): the latency unit (cycles vs. ns) is not visible in this
 * header.
 */
struct cn10k_ml_layer_xstats {
	/* Total hardware latency, sum of all inferences */
	uint64_t hw_latency_tot;

	/* Minimum hardware latency */
	uint64_t hw_latency_min;

	/* Maximum hardware latency */
	uint64_t hw_latency_max;

	/* Total firmware latency, sum of all inferences */
	uint64_t fw_latency_tot;

	/* Minimum firmware latency */
	uint64_t fw_latency_min;

	/* Maximum firmware latency */
	uint64_t fw_latency_max;

	/* Total jobs dequeued */
	uint64_t dequeued_count;

	/* Hardware stats reset index */
	uint64_t hw_reset_count;

	/* Firmware stats reset index */
	uint64_t fw_reset_count;
};
|
|
|
|
struct cn10k_ml_layer_data {
|
|
/* Model / Layer: metadata */
|
|
struct cn10k_ml_model_metadata metadata;
|
|
|
|
/* Layer: address structure */
|
|
struct cn10k_ml_layer_addr addr;
|
|
|
|
/* Layer: Tile and memory information object */
|
|
struct cn10k_ml_ocm_layer_map ocm_map;
|
|
|
|
/* Layer: Slow-path operations request pointer */
|
|
struct cnxk_ml_req *req;
|
|
|
|
/* Layer: Stats for burst ops */
|
|
struct cn10k_ml_layer_xstats *burst_xstats;
|
|
|
|
/* Layer: Stats for sync ops */
|
|
struct cn10k_ml_layer_xstats *sync_xstats;
|
|
};
|
|
|
|
struct cn10k_ml_model_data {
|
|
/* Model / Layer: metadata */
|
|
struct cn10k_ml_model_metadata metadata;
|
|
};
|
|
|
|
/* Model / layer helper API.
 *
 * NOTE(review): only the prototypes are visible in this header. The
 * one-line descriptions below are inferred from names and signatures;
 * confirm return values and side effects against the implementation.
 */

/* Presumably validates the metadata found in a model buffer of 'size' bytes;
 * returns an int status.
 */
int cn10k_ml_model_metadata_check(uint8_t *buffer, uint64_t size);

/* Presumably normalizes / fills in fields of 'metadata' in place. */
void cn10k_ml_model_metadata_update(struct cn10k_ml_model_metadata *metadata);

/* Presumably derives the layer's addresses from the model buffer and the
 * given base DMA address.
 */
void cn10k_ml_layer_addr_update(struct cnxk_ml_layer *layer, uint8_t *buffer,
				uint8_t *base_dma_addr);

/* Presumably populates 'io_info' from the input / output sections of
 * 'metadata'.
 */
void cn10k_ml_layer_io_info_set(struct cnxk_ml_io_info *io_info,
				struct cn10k_ml_model_metadata *metadata);

/* Returns a pointer to I/O info for the given layer of 'model'. */
struct cnxk_ml_io_info *cn10k_ml_model_io_info_get(struct cnxk_ml_model *model, uint16_t layer_id);

/* Writes page counts through 'wb_pages' and 'scratch_pages' (both are output
 * pointers by type); returns an int status.
 */
int cn10k_ml_model_ocm_pages_count(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer,
				   uint8_t *buffer, uint16_t *wb_pages, uint16_t *scratch_pages);

/* Presumably fills the model's info from 'io_info' and 'metadata'. */
void cn10k_ml_model_info_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
			     struct cnxk_ml_io_info *io_info,
			     struct cn10k_ml_model_metadata *metadata);

/* Presumably prints a description of 'layer' to the stream 'fp'. */
void cn10k_ml_layer_print(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer, FILE *fp);

/* Presumably looks up a layer by name and stores its id through 'layer_id';
 * returns an int status.
 */
int cn10k_ml_model_get_layer_id(struct cnxk_ml_model *model, const char *layer_name,
				uint16_t *layer_id);
|
|
|
|
#endif /* _CN10K_ML_MODEL_H_ */
|