/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2022 Marvell.
 */

#include <rte_mldev.h>
#include <rte_mldev_pmd.h>

#include <mldev_utils.h>

#include "cnxk_ml_dev.h"
#include "cnxk_ml_model.h"
#include "cnxk_ml_ops.h"
#include "cnxk_ml_xstats.h"

/* ML model macros */
#define CN10K_ML_MODEL_MEMZONE_NAME "ml_cn10k_model_mz"

/* ML layer macros */
#define CN10K_ML_LAYER_MEMZONE_NAME "ml_cn10k_layer_mz"

/* ML Job descriptor flags */
#define ML_FLAGS_POLL_COMPL BIT(0)
#define ML_FLAGS_SSO_COMPL  BIT(1)

/* Hardware non-fatal error subtype database */
static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
	{ML_CN10K_FW_ERR_NOERR, "NO ERROR"},
	{ML_CN10K_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
	{ML_CN10K_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
	{ML_CN10K_FW_ERR_ID_IN_USE, "MODEL ID IN USE"},
	{ML_CN10K_FW_ERR_INVALID_TILEMASK, "INVALID TILEMASK"},
	{ML_CN10K_FW_ERR_RUN_LUT_OVERFLOW, "RUN LUT OVERFLOW"},
	{ML_CN10K_FW_ERR_RUN_ID_NOT_FOUND, "RUN MODEL ID NOT FOUND"},
	{ML_CN10K_FW_ERR_COMMAND_NOTSUP, "COMMAND NOT SUPPORTED"},
	{ML_CN10K_FW_ERR_DDR_ADDR_RANGE, "DDR ADDRESS OUT OF RANGE"},
	{ML_CN10K_FW_ERR_NUM_BATCHES_INVALID, "INVALID BATCHES"},
	{ML_CN10K_FW_ERR_INSSYNC_TIMEOUT, "INSSYNC TIMEOUT"},
};

/* Driver error subtype database */
static struct cnxk_ml_error_db ml_stype_db_driver[] = {
	{ML_CN10K_DRIVER_ERR_NOERR, "NO ERROR"},
	{ML_CN10K_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
	{ML_CN10K_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
	{ML_CN10K_DRIVER_ERR_FW_ERROR, "UNKNOWN FIRMWARE ERROR"},
};

__rte_hot void
cn10k_ml_set_poll_addr(struct cnxk_ml_req *req)
{
	req->status = &req->cn10k_req.status;
}

void
cn10k_ml_qp_initialize(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_qp *qp)
{
	uint64_t i;

	RTE_SET_USED(cnxk_mldev);

	/* Initialize job command */
	for (i = 0; i < qp->nb_desc; i++) {
		memset(&qp->queue.reqs[i].cn10k_req.jd, 0, sizeof(struct cn10k_ml_jd));
		qp->queue.reqs[i].cn10k_req.jcmd.w1.s.jobptr =
			PLT_U64_CAST(&qp->queue.reqs[i].cn10k_req.jd);
	}
}

static void
cn10k_ml_prep_sp_job_descriptor(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer,
				struct cnxk_ml_req *req, enum cn10k_ml_job_type job_type)
{
	struct cn10k_ml_model_metadata *metadata;
	struct cn10k_ml_layer_addr *addr;
	struct cn10k_ml_dev *cn10k_mldev;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	metadata = &layer->glow.metadata;
	addr = &layer->glow.addr;

	memset(&req->cn10k_req.jd, 0, sizeof(struct cn10k_ml_jd));
	req->cn10k_req.jd.hdr.jce.w0.u64 = 0;
	req->cn10k_req.jd.hdr.jce.w1.u64 = PLT_U64_CAST(&req->cn10k_req.status);
	req->cn10k_req.jd.hdr.model_id = layer->index;
	req->cn10k_req.jd.hdr.job_type = job_type;
	req->cn10k_req.jd.hdr.fp_flags = 0x0;
	req->cn10k_req.jd.hdr.result =
		roc_ml_addr_ap2mlip(&cn10k_mldev->roc, &req->cn10k_req.result);

	if (job_type == ML_CN10K_JOB_TYPE_MODEL_START) {
		if (!layer->glow.metadata.model.ocm_relocatable)
			req->cn10k_req.jd.hdr.sp_flags = ML_CN10K_SP_FLAGS_OCM_NONRELOCATABLE;
		else
			req->cn10k_req.jd.hdr.sp_flags = 0x0;

		req->cn10k_req.jd.hdr.sp_flags |= ML_CN10K_SP_FLAGS_EXTENDED_LOAD_JD;
		req->cn10k_req.jd.model_start.extended_args = PLT_U64_CAST(
			roc_ml_addr_ap2mlip(&cn10k_mldev->roc, &req->cn10k_req.extended_args));
		req->cn10k_req.jd.model_start.model_dst_ddr_addr =
			PLT_U64_CAST(roc_ml_addr_ap2mlip(&cn10k_mldev->roc, addr->init_load_addr));
		req->cn10k_req.jd.model_start.model_init_offset = 0x0;
		req->cn10k_req.jd.model_start.model_main_offset = metadata->init_model.file_size;
		req->cn10k_req.jd.model_start.model_finish_offset =
			metadata->init_model.file_size + metadata->main_model.file_size;
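		/* The model image holds the init, main and finish sections back to back,
		 * followed by the weights and bias data: the offsets and sizes programmed
		 * below are simply cumulative file sizes taken from the model metadata.
		 */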
		req->cn10k_req.jd.model_start.model_init_size = metadata->init_model.file_size;
		req->cn10k_req.jd.model_start.model_main_size = metadata->main_model.file_size;
		req->cn10k_req.jd.model_start.model_finish_size = metadata->finish_model.file_size;
		req->cn10k_req.jd.model_start.model_wb_offset = metadata->init_model.file_size +
								metadata->main_model.file_size +
								metadata->finish_model.file_size;
		req->cn10k_req.jd.model_start.num_layers = metadata->model.num_layers;
		req->cn10k_req.jd.model_start.num_gather_entries = 0;
		req->cn10k_req.jd.model_start.num_scatter_entries = 0;
		req->cn10k_req.jd.model_start.tilemask = 0; /* Updated after reserving pages */
		req->cn10k_req.jd.model_start.batch_size = layer->batch_size;
		req->cn10k_req.jd.model_start.ocm_wb_base_address =
			0; /* Updated after reserving pages */
		req->cn10k_req.jd.model_start.ocm_wb_range_start = metadata->model.ocm_wb_range_start;
		req->cn10k_req.jd.model_start.ocm_wb_range_end = metadata->model.ocm_wb_range_end;
		req->cn10k_req.jd.model_start.ddr_wb_base_address = PLT_U64_CAST(roc_ml_addr_ap2mlip(
			&cn10k_mldev->roc,
			PLT_PTR_ADD(addr->finish_load_addr, metadata->finish_model.file_size)));
		req->cn10k_req.jd.model_start.ddr_wb_range_start = metadata->model.ddr_wb_range_start;
		req->cn10k_req.jd.model_start.ddr_wb_range_end = metadata->model.ddr_wb_range_end;
		req->cn10k_req.jd.model_start.input.s.ddr_range_start =
			metadata->model.ddr_input_range_start;
		req->cn10k_req.jd.model_start.input.s.ddr_range_end =
			metadata->model.ddr_input_range_end;
		req->cn10k_req.jd.model_start.output.s.ddr_range_start =
			metadata->model.ddr_output_range_start;
		req->cn10k_req.jd.model_start.output.s.ddr_range_end =
			metadata->model.ddr_output_range_end;

		req->cn10k_req.extended_args.start.ddr_scratch_base_address = PLT_U64_CAST(
			roc_ml_addr_ap2mlip(&cn10k_mldev->roc, addr->scratch_base_addr));
		req->cn10k_req.extended_args.start.ddr_scratch_range_start =
			metadata->model.ddr_scratch_range_start;
		req->cn10k_req.extended_args.start.ddr_scratch_range_end =
			metadata->model.ddr_scratch_range_end;
	}
}

static __rte_always_inline void
cn10k_ml_prep_fp_job_descriptor(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_req *req,
				uint16_t index, void *input, void *output, uint16_t nb_batches)
{
	struct cn10k_ml_dev *cn10k_mldev;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;

	req->cn10k_req.jd.hdr.jce.w0.u64 = 0;
	req->cn10k_req.jd.hdr.jce.w1.u64 = PLT_U64_CAST(req->status);
	req->cn10k_req.jd.hdr.model_id = index;
	req->cn10k_req.jd.hdr.job_type = ML_CN10K_JOB_TYPE_MODEL_RUN;
	req->cn10k_req.jd.hdr.fp_flags = ML_FLAGS_POLL_COMPL;
	req->cn10k_req.jd.hdr.sp_flags = 0x0;
	req->cn10k_req.jd.hdr.result =
		roc_ml_addr_ap2mlip(&cn10k_mldev->roc, &req->cn10k_req.result);
	req->cn10k_req.jd.model_run.input_ddr_addr =
		PLT_U64_CAST(roc_ml_addr_ap2mlip(&cn10k_mldev->roc, input));
	req->cn10k_req.jd.model_run.output_ddr_addr =
		PLT_U64_CAST(roc_ml_addr_ap2mlip(&cn10k_mldev->roc, output));
	req->cn10k_req.jd.model_run.num_batches = nb_batches;
}

static void
cn10k_ml_xstats_layer_name_update(struct cnxk_ml_dev *cnxk_mldev, uint16_t model_id,
				  uint16_t layer_id)
{
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	uint16_t rclk_freq;
	uint16_t sclk_freq;
	uint16_t stat_id;
	char suffix[8];
	uint16_t i;

	model = cnxk_mldev->mldev->data->models[model_id];
	layer = &model->layer[layer_id];
	stat_id = cnxk_mldev->xstats.offset_for_layer[model_id][layer_id];

	roc_clk_freq_get(&rclk_freq, &sclk_freq);
	if (sclk_freq == 0)
		strcpy(suffix, "cycles");
	else
		strcpy(suffix, "ns");

	/* Update xstat name based on layer name and sclk availability */
	for (i = 0; i < RTE_DIM(layer_xstats); i++) {
		snprintf(cnxk_mldev->xstats.entries[stat_id].map.name,
			 sizeof(cnxk_mldev->xstats.entries[stat_id].map.name), "%s-%s-%s",
			 layer->glow.metadata.model.name, layer_xstats[i].name, suffix);
		stat_id++;
	}
}

void
cn10k_ml_xstat_model_name_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
			      uint16_t stat_id, uint16_t entry, char *suffix)
{
	snprintf(cnxk_mldev->xstats.entries[stat_id].map.name,
		 sizeof(cnxk_mldev->xstats.entries[stat_id].map.name), "%s-%s-%s",
		 model->glow.metadata.model.name, model_xstats[entry].name, suffix);
}

#define ML_AVG_FOREACH_QP(cnxk_mldev, layer, qp_id, str, value, count)                      \
	do {                                                                                \
		value = 0;                                                                  \
		for (qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs; qp_id++) { \
			value += layer->glow.burst_xstats[qp_id].str##_latency_tot;        \
			count += layer->glow.burst_xstats[qp_id].dequeued_count -           \
				 layer->glow.burst_xstats[qp_id].str##_reset_count;         \
		}                                                                           \
		value += layer->glow.sync_xstats->str##_latency_tot;                        \
		count += layer->glow.sync_xstats->dequeued_count -                          \
			 layer->glow.sync_xstats->str##_reset_count;                        \
		if (count != 0)                                                             \
			value = value / count;                                              \
	} while (0)

#define ML_MIN_FOREACH_QP(cnxk_mldev, layer, qp_id, str, value, count)                             \
	do {                                                                                       \
		value = UINT64_MAX;                                                                \
		for (qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs; qp_id++) {        \
			value = PLT_MIN(value, layer->glow.burst_xstats[qp_id].str##_latency_min); \
			count += layer->glow.burst_xstats[qp_id].dequeued_count -                  \
				 layer->glow.burst_xstats[qp_id].str##_reset_count;                \
		}                                                                                  \
		value = PLT_MIN(value, layer->glow.sync_xstats->str##_latency_min);                \
		count += layer->glow.sync_xstats->dequeued_count -                                 \
			 layer->glow.sync_xstats->str##_reset_count;                               \
		if (count == 0)                                                                    \
			value = 0;                                                                 \
	} while (0)

#define ML_MAX_FOREACH_QP(cnxk_mldev, layer, qp_id, str, value, count)                             \
	do {                                                                                       \
		value = 0;                                                                         \
		for (qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs; qp_id++) {        \
			value = PLT_MAX(value, layer->glow.burst_xstats[qp_id].str##_latency_max); \
			count += layer->glow.burst_xstats[qp_id].dequeued_count -                  \
				 layer->glow.burst_xstats[qp_id].str##_reset_count;                \
		}                                                                                  \
		value = PLT_MAX(value, layer->glow.sync_xstats->str##_latency_max);                \
		count += layer->glow.sync_xstats->dequeued_count -                                 \
			 layer->glow.sync_xstats->str##_reset_count;                               \
		if (count == 0)                                                                    \
			value = 0;                                                                 \
	} while (0)

uint64_t
cn10k_ml_model_xstat_get(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer,
			 enum cnxk_ml_xstats_type type)
{
	uint64_t count = 0;
	uint64_t value = 0;
	uint32_t qp_id;

	switch (type) {
	case avg_hw_latency:
		ML_AVG_FOREACH_QP(cnxk_mldev, layer, qp_id, hw, value, count);
		break;
	case min_hw_latency:
		ML_MIN_FOREACH_QP(cnxk_mldev, layer, qp_id, hw, value, count);
		break;
	case max_hw_latency:
		ML_MAX_FOREACH_QP(cnxk_mldev, layer, qp_id, hw, value, count);
		break;
	case avg_fw_latency:
		ML_AVG_FOREACH_QP(cnxk_mldev, layer, qp_id, fw, value, count);
		break;
	case min_fw_latency:
		ML_MIN_FOREACH_QP(cnxk_mldev, layer, qp_id, fw, value, count);
		break;
	case max_fw_latency:
		ML_MAX_FOREACH_QP(cnxk_mldev, layer, qp_id, fw, value, count);
		break;
	default:
		value = 0;
	}

	return value;
}
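
/* Run a single dummy inference on zero-filled buffers after a layer is started.
 * This is only used when the cache_model_data option is enabled and is presumably
 * meant to pre-fetch the model data; the sync xstats are cleared afterwards so the
 * warm-up run does not skew the reported latencies.
 */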
static int
cn10k_ml_cache_model_data(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer)
{
	struct cn10k_ml_layer_xstats *xstats;
	char str[RTE_MEMZONE_NAMESIZE];
	const struct plt_memzone *mz;
	uint64_t isize = 0;
	uint64_t osize = 0;
	int ret = 0;

	/* Create input and output buffers. */
	isize = layer->info.total_input_sz_q;
	osize = layer->info.total_output_sz_q;

	snprintf(str, RTE_MEMZONE_NAMESIZE, "%s_%u", "ml_dummy_io", layer->index);
	mz = plt_memzone_reserve_aligned(str, isize + osize, 0, ML_CN10K_ALIGN_SIZE);
	if (mz == NULL)
		return -ENOMEM;
	memset(mz->addr, 0, isize + osize);

	memset(layer->glow.req, 0, sizeof(struct cnxk_ml_req));
	ret = cn10k_ml_inference_sync(cnxk_mldev, layer->index, mz->addr,
				      PLT_PTR_ADD(mz->addr, isize), 1);
	plt_memzone_free(mz);

	/* Reset sync xstats. */
	xstats = layer->glow.sync_xstats;
	xstats->hw_latency_tot = 0;
	xstats->hw_latency_min = UINT64_MAX;
	xstats->hw_latency_max = 0;
	xstats->fw_latency_tot = 0;
	xstats->fw_latency_min = UINT64_MAX;
	xstats->fw_latency_max = 0;
	xstats->dequeued_count = 0;

	return ret;
}

int
cn10k_ml_dev_info_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_dev_info *dev_info)
{
	struct cn10k_ml_dev *cn10k_mldev;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;

	if (cn10k_mldev->hw_queue_lock)
		dev_info->max_queue_pairs = ML_CN10K_MAX_QP_PER_DEVICE_SL;
	else
		dev_info->max_queue_pairs = ML_CN10K_MAX_QP_PER_DEVICE_LF;

	dev_info->max_desc = ML_CN10K_MAX_DESC_PER_QP;
	dev_info->max_io = ML_CN10K_MAX_INPUT_OUTPUT;
	dev_info->max_segments = ML_CN10K_MAX_SEGMENTS;
	dev_info->align_size = ML_CN10K_ALIGN_SIZE;

	return 0;
}

int
cn10k_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_config *conf)
{
	struct cn10k_ml_dev *cn10k_mldev;
	struct cn10k_ml_ocm *ocm;
	uint16_t tile_id;

	RTE_SET_USED(conf);

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	ocm = &cn10k_mldev->ocm;
	ocm->num_tiles = ML_CN10K_OCM_NUMTILES;
	ocm->size_per_tile = ML_CN10K_OCM_TILESIZE;
	ocm->page_size = cn10k_mldev->ocm_page_size;
	ocm->num_pages = ocm->size_per_tile / ocm->page_size;
	ocm->mask_words = ocm->num_pages / (8 * sizeof(uint8_t));

	/* Allocate memory for ocm_mask */
	ocm->ocm_mask =
		rte_zmalloc("ocm_mask", ocm->mask_words * ocm->num_tiles, RTE_CACHE_LINE_SIZE);
	if (ocm->ocm_mask == NULL) {
		plt_err("Unable to allocate memory for OCM mask");
		return -ENOMEM;
	}

	for (tile_id = 0; tile_id < ocm->num_tiles; tile_id++) {
		ocm->tile_ocm_info[tile_id].ocm_mask = ocm->ocm_mask + tile_id * ocm->mask_words;
		ocm->tile_ocm_info[tile_id].last_wb_page = -1;
	}

	rte_spinlock_init(&ocm->lock);

	/* Set JCMDQ enqueue function */
	if (cn10k_mldev->hw_queue_lock == 1)
		cn10k_mldev->ml_jcmdq_enqueue = roc_ml_jcmdq_enqueue_sl;
	else
		cn10k_mldev->ml_jcmdq_enqueue = roc_ml_jcmdq_enqueue_lf;

	return 0;
}

int
cn10k_ml_dev_close(struct cnxk_ml_dev *cnxk_mldev)
{
	struct cn10k_ml_dev *cn10k_mldev;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;

	/* Release ocm_mask memory */
	rte_free(cn10k_mldev->ocm.ocm_mask);

	/* Unload firmware */
	cn10k_ml_fw_unload(cnxk_mldev);

	/* Clear scratch registers */
	roc_ml_reg_write64(&cn10k_mldev->roc, 0, ML_SCRATCH_WORK_PTR);
	roc_ml_reg_write64(&cn10k_mldev->roc, 0, ML_SCRATCH_FW_CTRL);
	roc_ml_reg_write64(&cn10k_mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_HEAD_C0);
	roc_ml_reg_write64(&cn10k_mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_TAIL_C0);
	roc_ml_reg_write64(&cn10k_mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_HEAD_C1);
	roc_ml_reg_write64(&cn10k_mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_TAIL_C1);

	/* Reset ML_MLR_BASE */
	roc_ml_reg_write64(&cn10k_mldev->roc, 0, ML_MLR_BASE);
	plt_ml_dbg("ML_MLR_BASE = 0x%016lx", roc_ml_reg_read64(&cn10k_mldev->roc, ML_MLR_BASE));

	return 0;
}

int
cn10k_ml_dev_start(struct cnxk_ml_dev *cnxk_mldev)
{
	struct cn10k_ml_dev *cn10k_mldev;
	uint64_t reg_val64;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;

	reg_val64 = roc_ml_reg_read64(&cn10k_mldev->roc, ML_CFG);
	reg_val64 |= ROC_ML_CFG_ENA;
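	/* Setting the ENA bit in ML_CFG enables the ML block;
	 * cn10k_ml_dev_stop() clears the same bit to disable it.
	 */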
	roc_ml_reg_write64(&cn10k_mldev->roc, reg_val64, ML_CFG);
	plt_ml_dbg("ML_CFG => 0x%016lx", roc_ml_reg_read64(&cn10k_mldev->roc, ML_CFG));

	return 0;
}

int
cn10k_ml_dev_stop(struct cnxk_ml_dev *cnxk_mldev)
{
	struct cn10k_ml_dev *cn10k_mldev;
	uint64_t reg_val64;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;

	reg_val64 = roc_ml_reg_read64(&cn10k_mldev->roc, ML_CFG);
	reg_val64 &= ~ROC_ML_CFG_ENA;
	roc_ml_reg_write64(&cn10k_mldev->roc, reg_val64, ML_CFG);
	plt_ml_dbg("ML_CFG => 0x%016lx", roc_ml_reg_read64(&cn10k_mldev->roc, ML_CFG));

	return 0;
}

int
cn10k_ml_dev_dump(struct cnxk_ml_dev *cnxk_mldev, FILE *fp)
{
	struct cn10k_ml_dev *cn10k_mldev;
	struct cn10k_ml_fw *fw;
	uint32_t head_loc;
	uint32_t tail_loc;
	uint32_t bufsize;
	char *head_ptr;
	int core_id;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	fw = &cn10k_mldev->fw;

	/* Dump OCM state */
	cn10k_ml_ocm_print(cnxk_mldev, fp);

	if (roc_env_is_asim())
		return 0;

	/* Dump debug buffer */
	for (core_id = 0; core_id <= 1; core_id++) {
		bufsize = fw->req->cn10k_req.jd.fw_load.debug.debug_buffer_size;
		if (core_id == 0) {
			head_loc =
				roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_DBG_BUFFER_HEAD_C0);
			tail_loc =
				roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_DBG_BUFFER_TAIL_C0);
			head_ptr =
				PLT_PTR_CAST(fw->req->cn10k_req.jd.fw_load.debug.core0_debug_ptr);
			head_ptr = roc_ml_addr_mlip2ap(&cn10k_mldev->roc, head_ptr);
		} else {
			head_loc =
				roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_DBG_BUFFER_HEAD_C1);
			tail_loc =
				roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_DBG_BUFFER_TAIL_C1);
			head_ptr =
				PLT_PTR_CAST(fw->req->cn10k_req.jd.fw_load.debug.core1_debug_ptr);
			head_ptr = roc_ml_addr_mlip2ap(&cn10k_mldev->roc, head_ptr);
		}
		if (head_loc < tail_loc) {
			fprintf(fp, "%.*s\n", tail_loc - head_loc, &head_ptr[head_loc]);
		} else if (head_loc >= tail_loc + 1) {
			fprintf(fp, "%.*s\n", bufsize - tail_loc, &head_ptr[head_loc]);
			fprintf(fp, "%.*s\n", tail_loc, &head_ptr[0]);
		}
	}

	/* Dump exception info */
	for (core_id = 0; core_id <= 1; core_id++) {
		bufsize = fw->req->cn10k_req.jd.fw_load.debug.exception_state_size;
		if ((core_id == 0) &&
		    (roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C0) != 0)) {
			head_ptr = PLT_PTR_CAST(
				fw->req->cn10k_req.jd.fw_load.debug.core0_exception_buffer);
			fprintf(fp, "ML_SCRATCH_EXCEPTION_SP_C0 = 0x%016lx",
				roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C0));
			head_ptr = roc_ml_addr_mlip2ap(&cn10k_mldev->roc, head_ptr);
			fprintf(fp, "%.*s", bufsize, head_ptr);
		} else if ((core_id == 1) &&
			   (roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C1) !=
			    0)) {
			head_ptr = PLT_PTR_CAST(
				fw->req->cn10k_req.jd.fw_load.debug.core1_exception_buffer);
			fprintf(fp, "ML_SCRATCH_EXCEPTION_SP_C1 = 0x%016lx",
				roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C1));
			head_ptr = roc_ml_addr_mlip2ap(&cn10k_mldev->roc, head_ptr);
			fprintf(fp, "%.*s", bufsize, head_ptr);
		}
	}

	return 0;
}
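
/* Firmware self-test: a single job descriptor is submitted through the scratch
 * registers (the slow path used before queue pairs exist) and the status word is
 * polled until the firmware reports completion or ML_CNXK_CMD_TIMEOUT expires.
 */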
int
cn10k_ml_dev_selftest(struct cnxk_ml_dev *cnxk_mldev)
{
	struct cn10k_ml_dev *cn10k_mldev;
	const struct plt_memzone *mz;
	struct cnxk_ml_req *req;
	uint64_t timeout_cycle;
	bool timeout;
	int ret;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	mz = plt_memzone_reserve_aligned("dev_selftest", sizeof(struct cnxk_ml_req), 0,
					 ML_CN10K_ALIGN_SIZE);
	if (mz == NULL) {
		plt_err("Could not allocate reserved memzone");
		return -ENOMEM;
	}
	req = mz->addr;

	/* Prepare load completion structure */
	memset(&req->cn10k_req.jd, 0, sizeof(struct cn10k_ml_jd));
	req->cn10k_req.jd.hdr.jce.w1.u64 = PLT_U64_CAST(&req->cn10k_req.status);
	req->cn10k_req.jd.hdr.job_type = ML_CN10K_JOB_TYPE_FIRMWARE_SELFTEST;
	req->cn10k_req.jd.hdr.result =
		roc_ml_addr_ap2mlip(&cn10k_mldev->roc, &req->cn10k_req.result);
	req->cn10k_req.jd.fw_load.flags = cn10k_ml_fw_flags_get(&cn10k_mldev->fw);
	plt_write64(ML_CNXK_POLL_JOB_START, &req->cn10k_req.status);
	plt_wmb();

	/* Enqueue firmware selftest request through scratch registers */
	timeout = true;
	timeout_cycle = plt_tsc_cycles() + ML_CNXK_CMD_TIMEOUT * plt_tsc_hz();
	roc_ml_scratch_enqueue(&cn10k_mldev->roc, &req->cn10k_req.jd);

	plt_rmb();
	do {
		if (roc_ml_scratch_is_done_bit_set(&cn10k_mldev->roc) &&
		    (plt_read64(&req->cn10k_req.status) == ML_CNXK_POLL_JOB_FINISH)) {
			timeout = false;
			break;
		}
	} while (plt_tsc_cycles() < timeout_cycle);

	/* Check firmware selftest status, clean-up and exit */
	ret = 0;
	if (timeout) {
		ret = -ETIME;
	} else {
		if (req->cn10k_req.result.error_code != 0)
			ret = -1;
	}

	plt_memzone_free(mz);

	return ret;
}

int
cn10k_ml_layer_load(void *device, uint16_t model_id, const char *layer_name, uint8_t *buffer,
		    size_t size, uint16_t *index)
{
	struct cn10k_ml_model_metadata *metadata;
	struct cnxk_ml_dev *cnxk_mldev;
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	char str[RTE_MEMZONE_NAMESIZE];
	const struct plt_memzone *mz;
	size_t layer_object_size = 0;
	size_t layer_scratch_size;
	size_t layer_xstats_size;
	uint8_t *base_dma_addr;
	uint16_t scratch_pages;
	uint16_t layer_id;
	uint16_t wb_pages;
	uint64_t mz_size;
	uint16_t idx;
	int qp_id;
	int ret;

	PLT_SET_USED(size);

	cnxk_mldev = (struct cnxk_ml_dev *)device;
	if (cnxk_mldev == NULL) {
		plt_err("Invalid device = %p", device);
		return -EINVAL;
	}

	model = cnxk_mldev->mldev->data->models[model_id];
	if (model == NULL) {
		plt_err("Invalid model_id = %u", model_id);
		return -EINVAL;
	}

	ret = cn10k_ml_model_get_layer_id(model, layer_name, &layer_id);
	if (ret != 0)
		return ret;

	layer = &model->layer[layer_id];

	ret = cn10k_ml_model_metadata_check(buffer, size);
	if (ret != 0)
		return ret;

	/* Get index */
	for (idx = 0; idx < cnxk_mldev->max_nb_layers; idx++) {
		if (!cnxk_mldev->index_map[idx].active) {
			layer->index = idx;
			break;
		}
	}

	if (idx >= cnxk_mldev->max_nb_layers) {
		plt_err("No slots available for model layers, model_id = %u, layer_id = %u",
			model->model_id, layer_id);
		return -1;
	}

	layer->model = model;

	/* Get WB and scratch pages, check if model can be loaded. */
	ret = cn10k_ml_model_ocm_pages_count(cnxk_mldev, layer, buffer, &wb_pages, &scratch_pages);
	if (ret < 0)
		return ret;

	/* Compute layer memzone size */
	metadata = (struct cn10k_ml_model_metadata *)buffer;
	layer_object_size = metadata->init_model.file_size + metadata->main_model.file_size +
			    metadata->finish_model.file_size + metadata->weights_bias.file_size;
	layer_object_size = PLT_ALIGN_CEIL(layer_object_size, ML_CN10K_ALIGN_SIZE);
	layer_scratch_size = PLT_ALIGN_CEIL(metadata->model.ddr_scratch_range_end -
						    metadata->model.ddr_scratch_range_start + 1,
					    ML_CN10K_ALIGN_SIZE);
	layer_xstats_size = (cnxk_mldev->mldev->data->nb_queue_pairs + 1) *
			    sizeof(struct cn10k_ml_layer_xstats);

	/* Allocate memzone for model data */
	mz_size = layer_object_size + layer_scratch_size +
		  PLT_ALIGN_CEIL(sizeof(struct cnxk_ml_req), ML_CN10K_ALIGN_SIZE) +
		  layer_xstats_size;
	snprintf(str, RTE_MEMZONE_NAMESIZE, "%s_%u_%u", CN10K_ML_LAYER_MEMZONE_NAME,
		 model->model_id, layer_id);
	mz = plt_memzone_reserve_aligned(str, mz_size, 0, ML_CN10K_ALIGN_SIZE);
	if (!mz) {
		plt_err("plt_memzone_reserve failed : %s", str);
		return -ENOMEM;
	}

	/* Copy metadata to internal buffer */
	rte_memcpy(&layer->glow.metadata, buffer, sizeof(struct cn10k_ml_model_metadata));
	cn10k_ml_model_metadata_update(&layer->glow.metadata);

	/* Set layer name */
	rte_memcpy(layer->name, layer->glow.metadata.model.name, MRVL_ML_MODEL_NAME_LEN);

	/* Enable support for batch_size of 256 */
	if (layer->glow.metadata.model.batch_size == 0)
		layer->batch_size = 256;
	else
		layer->batch_size = layer->glow.metadata.model.batch_size;

	/* Set DMA base address */
	base_dma_addr = mz->addr;
	cn10k_ml_layer_addr_update(layer, buffer, base_dma_addr);

	/* Set scratch base address */
	layer->glow.addr.scratch_base_addr = PLT_PTR_ADD(base_dma_addr, layer_object_size);

	/* Update internal I/O data structure */
	cn10k_ml_layer_io_info_set(&layer->info, &layer->glow.metadata);

	/* Initialize model_mem_map */
	memset(&layer->glow.ocm_map, 0, sizeof(struct cn10k_ml_ocm_layer_map));
	layer->glow.ocm_map.ocm_reserved = false;
	layer->glow.ocm_map.tilemask = 0;
	layer->glow.ocm_map.wb_page_start = -1;
	layer->glow.ocm_map.wb_pages = wb_pages;
	layer->glow.ocm_map.scratch_pages = scratch_pages;

	/* Set slow-path request address and state */
	layer->glow.req = PLT_PTR_ADD(mz->addr, layer_object_size + layer_scratch_size);

	/* Reset burst and sync stats */
	layer->glow.burst_xstats = PLT_PTR_ADD(
		layer->glow.req, PLT_ALIGN_CEIL(sizeof(struct cnxk_ml_req), ML_CN10K_ALIGN_SIZE));
	for (qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs + 1; qp_id++) {
		layer->glow.burst_xstats[qp_id].hw_latency_tot = 0;
		layer->glow.burst_xstats[qp_id].hw_latency_min = UINT64_MAX;
		layer->glow.burst_xstats[qp_id].hw_latency_max = 0;
		layer->glow.burst_xstats[qp_id].fw_latency_tot = 0;
		layer->glow.burst_xstats[qp_id].fw_latency_min = UINT64_MAX;
		layer->glow.burst_xstats[qp_id].fw_latency_max = 0;
		layer->glow.burst_xstats[qp_id].hw_reset_count = 0;
		layer->glow.burst_xstats[qp_id].fw_reset_count = 0;
		layer->glow.burst_xstats[qp_id].dequeued_count = 0;
	}

	layer->glow.sync_xstats =
		PLT_PTR_ADD(layer->glow.burst_xstats, cnxk_mldev->mldev->data->nb_queue_pairs *
							      sizeof(struct cn10k_ml_layer_xstats));

	/* Update xstats names */
	cn10k_ml_xstats_layer_name_update(cnxk_mldev, model_id, layer_id);

	layer->state = ML_CNXK_LAYER_STATE_LOADED;
	cnxk_mldev->index_map[idx].model_id = model->model_id;
	cnxk_mldev->index_map[idx].layer_id = layer_id;
	cnxk_mldev->index_map[idx].active = true;
	*index = idx;

	return 0;
}
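
/* Glow models are handled as a single layer by this driver: the metadata is copied
 * into the model, layer 0 is loaded through cn10k_ml_layer_load() and the fast-path
 * callbacks (enqueue, result update, error code and poll address helpers) are wired
 * up here.
 */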
int
cn10k_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *params,
		    struct cnxk_ml_model *model)
{
	struct cnxk_ml_layer *layer;
	int ret;

	/* Metadata check */
	ret = cn10k_ml_model_metadata_check(params->addr, params->size);
	if (ret != 0)
		return ret;

	/* Set model sub type */
	model->subtype = ML_CNXK_MODEL_SUBTYPE_GLOW_MRVL;

	/* Copy metadata to internal buffer */
	rte_memcpy(&model->glow.metadata, params->addr, sizeof(struct cn10k_ml_model_metadata));
	cn10k_ml_model_metadata_update(&model->glow.metadata);

	/* Set model name */
	rte_memcpy(model->name, (char *)model->glow.metadata.model.name, 64);

	/* Enable support for batch_size of 256 */
	if (model->glow.metadata.model.batch_size == 0)
		model->batch_size = 256;
	else
		model->batch_size = model->glow.metadata.model.batch_size;

	/* Since the number of layers that the driver handles for glow models is always 1,
	 * consider the entire model as a model with a single layer. This ignores the
	 * num_layers field from the metadata.
	 */
	model->nb_layers = 1;

	/* Load layer and get the index */
	layer = &model->layer[0];
	layer->type = ML_CNXK_LAYER_TYPE_MRVL;
	ret = cn10k_ml_layer_load(cnxk_mldev, model->model_id, NULL, params->addr, params->size,
				  &layer->index);
	if (ret != 0) {
		plt_err("Model layer load failed: model_id = %u, layer_id = %u", model->model_id,
			0);
		return ret;
	}

	cn10k_ml_model_info_set(cnxk_mldev, model, &model->layer[0].info, &model->glow.metadata);

	/* Set fast-path functions */
	model->enqueue_single = cn10k_ml_enqueue_single;
	model->result_update = cn10k_ml_result_update;
	model->set_error_code = cn10k_ml_set_error_code;
	model->set_poll_addr = cn10k_ml_set_poll_addr;

	return 0;
}

int
cn10k_ml_layer_unload(void *device, uint16_t model_id, const char *layer_name)
{
	struct cnxk_ml_dev *cnxk_mldev;
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	char str[RTE_MEMZONE_NAMESIZE];
	uint16_t layer_id;
	int ret;

	cnxk_mldev = (struct cnxk_ml_dev *)device;
	if (cnxk_mldev == NULL) {
		plt_err("Invalid device = %p", device);
		return -EINVAL;
	}

	model = cnxk_mldev->mldev->data->models[model_id];
	if (model == NULL) {
		plt_err("Invalid model_id = %u", model_id);
		return -EINVAL;
	}

	ret = cn10k_ml_model_get_layer_id(model, layer_name, &layer_id);
	if (ret != 0)
		return ret;

	layer = &model->layer[layer_id];

	snprintf(str, RTE_MEMZONE_NAMESIZE, "%s_%u_%u", CN10K_ML_LAYER_MEMZONE_NAME,
		 model->model_id, layer_id);
	ret = plt_memzone_free(plt_memzone_lookup(str));

	layer->state = ML_CNXK_LAYER_STATE_UNKNOWN;
	cnxk_mldev->index_map[layer->index].active = false;

	return ret;
}

int
cn10k_ml_model_unload(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model)
{
	return cn10k_ml_layer_unload(cnxk_mldev, model->model_id, NULL);
}

int
cn10k_ml_layer_start(void *device, uint16_t model_id, const char *layer_name)
{
	struct cn10k_ml_dev *cn10k_mldev;
	struct cnxk_ml_dev *cnxk_mldev;
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	struct cn10k_ml_ocm *ocm;
	struct cnxk_ml_req *req;
	uint16_t layer_id;
	bool job_enqueued;
	bool job_dequeued;
	uint8_t num_tiles;
	uint64_t tilemask;
	int wb_page_start;
	int tile_start;
	int tile_end;
	bool locked;
	int ret = 0;

	cnxk_mldev = (struct cnxk_ml_dev *)device;
	if (cnxk_mldev == NULL) {
		plt_err("Invalid device = %p", device);
		return -EINVAL;
	}

	model = cnxk_mldev->mldev->data->models[model_id];
	if (model == NULL) {
		plt_err("Invalid model_id = %u", model_id);
		return -EINVAL;
	}

	ret = cn10k_ml_model_get_layer_id(model, layer_name, &layer_id);
	if (ret != 0)
		return ret;

	layer = &model->layer[layer_id];
	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	ocm = &cn10k_mldev->ocm;

	/* Prepare JD */
	req = layer->glow.req;
	cn10k_ml_prep_sp_job_descriptor(cnxk_mldev, layer, req, ML_CN10K_JOB_TYPE_MODEL_START);
	req->cn10k_req.result.error_code = 0x0;
	req->cn10k_req.result.user_ptr = NULL;

	plt_write64(ML_CNXK_POLL_JOB_START, &req->cn10k_req.status);
	plt_wmb();

	num_tiles = layer->glow.metadata.model.tile_end - layer->glow.metadata.model.tile_start + 1;

	locked = false;
	while (!locked) {
		if (plt_spinlock_trylock(&model->lock) != 0) {
			if (layer->state == ML_CNXK_LAYER_STATE_STARTED) {
				plt_ml_dbg("Layer already started, model_id = %u, layer_id = %u",
					   model->model_id, layer_id);
				plt_spinlock_unlock(&model->lock);
				return 1;
			}

			if (layer->state == ML_CNXK_LAYER_STATE_JOB_ACTIVE) {
				plt_err("A slow-path job is active for the model_id = %u",
					model->model_id);
				plt_spinlock_unlock(&model->lock);
				return -EBUSY;
			}

			layer->state = ML_CNXK_LAYER_STATE_JOB_ACTIVE;
			plt_spinlock_unlock(&model->lock);
			locked = true;
		}
	}

	while (!layer->glow.ocm_map.ocm_reserved) {
		if (plt_spinlock_trylock(&ocm->lock) != 0) {
			wb_page_start = cn10k_ml_ocm_tilemask_find(
				cnxk_mldev, num_tiles, layer->glow.ocm_map.wb_pages,
				layer->glow.ocm_map.scratch_pages, &tilemask);

			if (wb_page_start == -1) {
				plt_err("Free pages not available on OCM tiles");
				plt_err("Failed to start layer, model_id = %u, layer_id = %u",
					model->model_id, layer_id);
				plt_spinlock_unlock(&ocm->lock);
				return -ENOMEM;
			}

			layer->glow.ocm_map.tilemask = tilemask;
			layer->glow.ocm_map.wb_page_start = wb_page_start;

			cn10k_ml_ocm_reserve_pages(cnxk_mldev, model->model_id, layer_id,
						   layer->glow.ocm_map.tilemask,
						   layer->glow.ocm_map.wb_page_start,
						   layer->glow.ocm_map.wb_pages,
						   layer->glow.ocm_map.scratch_pages);
			layer->glow.ocm_map.ocm_reserved = true;
			plt_spinlock_unlock(&ocm->lock);
		}
	}

	/* Update JD */
	cn10k_ml_ocm_tilecount(layer->glow.ocm_map.tilemask, &tile_start, &tile_end);
	req->cn10k_req.jd.model_start.tilemask = GENMASK_ULL(tile_end, tile_start);
	req->cn10k_req.jd.model_start.ocm_wb_base_address =
		layer->glow.ocm_map.wb_page_start * ocm->page_size;

	job_enqueued = false;
	job_dequeued = false;
	do {
		if (!job_enqueued) {
			req->timeout = plt_tsc_cycles() + ML_CNXK_CMD_TIMEOUT * plt_tsc_hz();
			job_enqueued =
				roc_ml_scratch_enqueue(&cn10k_mldev->roc, &req->cn10k_req.jd);
		}

		if (job_enqueued && !job_dequeued)
			job_dequeued =
				roc_ml_scratch_dequeue(&cn10k_mldev->roc, &req->cn10k_req.jd);

		if (job_dequeued)
			break;
	} while (plt_tsc_cycles() < req->timeout);

	if (job_dequeued) {
		if (plt_read64(&req->cn10k_req.status) == ML_CNXK_POLL_JOB_FINISH) {
			if (req->cn10k_req.result.error_code == 0)
				ret = 0;
			else
				ret = -1;
		}
	} else {
		/* Reset scratch registers */
		roc_ml_scratch_queue_reset(&cn10k_mldev->roc);
		ret = -ETIME;
	}

	locked = false;
	while (!locked) {
		if (plt_spinlock_trylock(&model->lock) != 0) {
			if (ret == 0)
				layer->state = ML_CNXK_LAYER_STATE_STARTED;
			else
				layer->state = ML_CNXK_LAYER_STATE_UNKNOWN;

			plt_spinlock_unlock(&model->lock);
			locked = true;
		}
	}

	if (layer->state == ML_CNXK_LAYER_STATE_UNKNOWN) {
		while (layer->glow.ocm_map.ocm_reserved) {
			if (plt_spinlock_trylock(&ocm->lock) != 0) {
				cn10k_ml_ocm_free_pages(cnxk_mldev, model->model_id, layer_id);
				layer->glow.ocm_map.ocm_reserved = false;
				layer->glow.ocm_map.tilemask = 0x0;
				plt_spinlock_unlock(&ocm->lock);
			}
		}
	}

	if (ret < 0) {
		cn10k_ml_layer_stop(device, model_id, layer_name);
	} else {
		if (cn10k_mldev->cache_model_data && model->type == ML_CNXK_MODEL_TYPE_GLOW)
			ret = cn10k_ml_cache_model_data(cnxk_mldev, layer);
	}

	return ret;
}
int
cn10k_ml_model_start(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model)
{
	struct cnxk_ml_layer *layer;
	int ret;

	layer = &model->layer[0];
	ret = cn10k_ml_layer_start(cnxk_mldev, model->model_id, layer->name);
	if (ret != 0) {
		plt_err("CN10K Model start failed, model_id = %u, error = %d", model->model_id,
			ret);
		return ret;
	}

	cnxk_mldev->nb_models_started++;
	model->state = ML_CNXK_MODEL_STATE_STARTED;

	return 0;
}

int
cn10k_ml_layer_stop(void *device, uint16_t model_id, const char *layer_name)
{
	struct cn10k_ml_dev *cn10k_mldev;
	struct cnxk_ml_dev *cnxk_mldev;
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	struct cn10k_ml_ocm *ocm;
	struct cnxk_ml_req *req;
	uint16_t layer_id;
	bool job_enqueued;
	bool job_dequeued;
	bool locked;
	int ret = 0;

	cnxk_mldev = (struct cnxk_ml_dev *)device;
	if (cnxk_mldev == NULL) {
		plt_err("Invalid device = %p", device);
		return -EINVAL;
	}

	model = cnxk_mldev->mldev->data->models[model_id];
	if (model == NULL) {
		plt_err("Invalid model_id = %u", model_id);
		return -EINVAL;
	}

	ret = cn10k_ml_model_get_layer_id(model, layer_name, &layer_id);
	if (ret != 0)
		return ret;

	layer = &model->layer[layer_id];
	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	ocm = &cn10k_mldev->ocm;

	/* Prepare JD */
	req = layer->glow.req;
	cn10k_ml_prep_sp_job_descriptor(cnxk_mldev, layer, req, ML_CN10K_JOB_TYPE_MODEL_STOP);
	req->cn10k_req.result.error_code = 0x0;
	req->cn10k_req.result.user_ptr = NULL;

	plt_write64(ML_CNXK_POLL_JOB_START, &req->cn10k_req.status);
	plt_wmb();

	locked = false;
	while (!locked) {
		if (plt_spinlock_trylock(&model->lock) != 0) {
			if (layer->state == ML_CNXK_LAYER_STATE_LOADED) {
				plt_ml_dbg("Layer not started, model_id = %u, layer_id = %u",
					   model->model_id, layer_id);
				plt_spinlock_unlock(&model->lock);
				return 1;
			}

			if (layer->state == ML_CNXK_LAYER_STATE_JOB_ACTIVE) {
				plt_err("A slow-path job is active for the layer, model_id = %u, layer_id = %u",
					model->model_id, layer_id);
				plt_spinlock_unlock(&model->lock);
				return -EBUSY;
			}

			layer->state = ML_CNXK_LAYER_STATE_JOB_ACTIVE;
			plt_spinlock_unlock(&model->lock);
			locked = true;
		}
	}

	while (layer->glow.ocm_map.ocm_reserved) {
		if (plt_spinlock_trylock(&ocm->lock) != 0) {
			cn10k_ml_ocm_free_pages(cnxk_mldev, model->model_id, layer_id);
			layer->glow.ocm_map.ocm_reserved = false;
			layer->glow.ocm_map.tilemask = 0x0;
			plt_spinlock_unlock(&ocm->lock);
		}
	}

	job_enqueued = false;
	job_dequeued = false;
	do {
		if (!job_enqueued) {
			req->timeout = plt_tsc_cycles() + ML_CNXK_CMD_TIMEOUT * plt_tsc_hz();
			job_enqueued =
				roc_ml_scratch_enqueue(&cn10k_mldev->roc, &req->cn10k_req.jd);
		}

		if (job_enqueued && !job_dequeued)
			job_dequeued =
				roc_ml_scratch_dequeue(&cn10k_mldev->roc, &req->cn10k_req.jd);

		if (job_dequeued)
			break;
	} while (plt_tsc_cycles() < req->timeout);

	if (job_dequeued) {
		if (plt_read64(&req->cn10k_req.status) == ML_CNXK_POLL_JOB_FINISH) {
			if (req->cn10k_req.result.error_code == 0x0)
				ret = 0;
			else
				ret = -1;
		}
	} else {
		roc_ml_scratch_queue_reset(&cn10k_mldev->roc);
		ret = -ETIME;
	}

	locked = false;
	while (!locked) {
		if (plt_spinlock_trylock(&model->lock) != 0) {
			if (ret == 0)
				layer->state = ML_CNXK_LAYER_STATE_LOADED;
			else
				layer->state = ML_CNXK_LAYER_STATE_UNKNOWN;

			plt_spinlock_unlock(&model->lock);
			locked = true;
		}
	}

	return ret;
}

int
cn10k_ml_model_stop(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model)
{
	struct cnxk_ml_layer *layer;
	int ret;

	layer = &model->layer[0];
	ret = cn10k_ml_layer_stop(cnxk_mldev, model->model_id, layer->name);
	if (ret != 0) {
		plt_err("CN10K Model stop failed, model_id = %u, error = %d", model->model_id,
			ret);
		return ret;
	}

	cnxk_mldev->nb_models_stopped++;
	model->state = ML_CNXK_MODEL_STATE_LOADED;

	return 0;
}

int
cn10k_ml_model_params_update(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
			     void *buffer)
{
	struct cnxk_ml_layer *layer;

	RTE_SET_USED(cnxk_mldev);

	if (model->state == ML_CNXK_MODEL_STATE_UNKNOWN)
		return -1;
	else if (model->state != ML_CNXK_MODEL_STATE_LOADED)
		return -EBUSY;

	layer = &model->layer[0];

	/* Update model weights & bias */
	rte_memcpy(layer->glow.addr.wb_load_addr, buffer,
		   layer->glow.metadata.weights_bias.file_size);

	return 0;
}
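
/* Completion handler shared by the burst dequeue and the synchronous path (qp_id < 0
 * selects the sync xstats). On success the firmware-reported timestamps are folded
 * into the latency xstats: hw latency is hw_end - hw_start and fw latency is the
 * remaining firmware time, (fw_end - fw_start) - hw_latency. On failure, driver-level
 * errors are refined by checking the exception and core interrupt registers before
 * the op is reported as errored.
 */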
__rte_hot void
cn10k_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)
{
	union cn10k_ml_error_code *error_code;
	struct cn10k_ml_layer_xstats *xstats;
	struct cn10k_ml_dev *cn10k_mldev;
	struct cn10k_ml_result *result;
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	struct cnxk_ml_req *req;
	struct cnxk_ml_qp *qp;
	struct rte_ml_op *op;
	uint64_t hw_latency;
	uint64_t fw_latency;
	uint16_t model_id;
	uint16_t layer_id;
	uint16_t idx;

	req = (struct cnxk_ml_req *)request;
	result = &req->cn10k_req.result;
	op = req->op;

	if (likely(result->error_code == 0)) {
		idx = req->cn10k_req.jd.hdr.model_id;
		model_id = cnxk_mldev->index_map[idx].model_id;
		layer_id = cnxk_mldev->index_map[idx].layer_id;
		model = cnxk_mldev->mldev->data->models[model_id];
		layer = &model->layer[layer_id];
		if (likely(qp_id >= 0)) {
			qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
			qp->stats.dequeued_count++;
			xstats = &layer->glow.burst_xstats[qp_id];
		} else {
			xstats = layer->glow.sync_xstats;
		}

		if (unlikely(xstats->dequeued_count == xstats->hw_reset_count)) {
			xstats->hw_latency_min = UINT64_MAX;
			xstats->hw_latency_max = 0;
		}

		if (unlikely(xstats->dequeued_count == xstats->fw_reset_count)) {
			xstats->fw_latency_min = UINT64_MAX;
			xstats->fw_latency_max = 0;
		}

		hw_latency = result->stats.hw_end - result->stats.hw_start;
		fw_latency = result->stats.fw_end - result->stats.fw_start - hw_latency;

		xstats->hw_latency_tot += hw_latency;
		xstats->hw_latency_min = PLT_MIN(xstats->hw_latency_min, hw_latency);
		xstats->hw_latency_max = PLT_MAX(xstats->hw_latency_max, hw_latency);
		xstats->fw_latency_tot += fw_latency;
		xstats->fw_latency_min = PLT_MIN(xstats->fw_latency_min, fw_latency);
		xstats->fw_latency_max = PLT_MAX(xstats->fw_latency_max, fw_latency);
		xstats->dequeued_count++;

		op->impl_opaque = result->error_code;
		op->status = RTE_ML_OP_STATUS_SUCCESS;
	} else {
		if (likely(qp_id >= 0)) {
			qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
			qp->stats.dequeue_err_count++;
		}

		/* Handle driver error */
		error_code = (union cn10k_ml_error_code *)&result->error_code;
		if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
			cn10k_mldev = &cnxk_mldev->cn10k_mldev;

			/* Check for exception */
			if ((roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C0) !=
			     0) ||
			    (roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C1) != 0))
				error_code->s.stype = ML_CN10K_DRIVER_ERR_EXCEPTION;
			else if ((roc_ml_reg_read64(&cn10k_mldev->roc, ML_CORE_INT_LO) != 0) ||
				 (roc_ml_reg_read64(&cn10k_mldev->roc, ML_CORE_INT_HI) != 0))
				error_code->s.stype = ML_CN10K_DRIVER_ERR_FW_ERROR;
			else
				error_code->s.stype = ML_CN10K_DRIVER_ERR_UNKNOWN;
		}

		op->impl_opaque = result->error_code;
		op->status = RTE_ML_OP_STATUS_ERROR;
	}

	op->user_ptr = result->user_ptr;
}

__rte_hot void
cn10k_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype)
{
	union cn10k_ml_error_code *error_code;

	error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
	error_code->s.etype = etype;
	error_code->s.stype = stype;
}
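
/* Fast-path single-op enqueue: fills the MODEL_RUN job descriptor for the op, arms the
 * poll address and pushes the job command to the JCMDQ. Returns false when the command
 * queue cannot accept the job so that the caller can stop the burst and retry later.
 */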
__rte_hot bool
cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, uint16_t layer_id,
			struct cnxk_ml_qp *qp, uint64_t head)
{
	union cn10k_ml_error_code *error_code;
	struct cn10k_ml_dev *cn10k_mldev;
	struct cnxk_ml_model *model;
	struct cnxk_ml_queue *queue;
	struct cnxk_ml_req *req;

	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	queue = &qp->queue;
	req = &queue->reqs[head];

	model = cnxk_mldev->mldev->data->models[op->model_id];
	model->set_poll_addr(req);
	cn10k_ml_prep_fp_job_descriptor(cnxk_mldev, req, model->layer[layer_id].index,
					op->input[0]->addr, op->output[0]->addr, op->nb_batches);

	memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
	error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
	error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
	req->cn10k_req.result.user_ptr = op->user_ptr;

	cnxk_ml_set_poll_ptr(req);
	if (unlikely(!cn10k_mldev->ml_jcmdq_enqueue(&cn10k_mldev->roc, &req->cn10k_req.jcmd)))
		return false;

	req->timeout = plt_tsc_cycles() + queue->wait_cycles;
	req->op = op;

	return true;
}

__rte_hot int
cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
{
	union cn10k_ml_error_code *error_code;

	PLT_SET_USED(dev);

	error_code = (union cn10k_ml_error_code *)&op->impl_opaque;

	/* Copy sub error message */
	if (error_code->s.etype == ML_CNXK_ETYPE_HW_NONFATAL) {
		if (error_code->s.stype < PLT_DIM(ml_stype_db_hw_nf))
			snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
				 ml_etype_db[error_code->s.etype].str,
				 ml_stype_db_hw_nf[error_code->s.stype].str);
		else
			snprintf(error->message, RTE_ML_STR_MAX, "%s : UNKNOWN ERROR",
				 ml_etype_db[error_code->s.etype].str);
	} else if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
		snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
			 ml_etype_db[error_code->s.etype].str,
			 ml_stype_db_driver[error_code->s.stype].str);
	} else {
		snprintf(error->message, RTE_ML_STR_MAX, "%s",
			 ml_etype_db[error_code->s.etype].str);
	}

	error->errcode = error_code->u64;

	return 0;
}

__rte_hot int
cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
			uint16_t nb_batches)
{
	union cn10k_ml_error_code *error_code;
	struct cn10k_ml_dev *cn10k_mldev;
	struct cnxk_ml_dev *cnxk_mldev;
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	struct cnxk_ml_req *req;
	struct rte_ml_op op;
	uint16_t model_id;
	uint16_t layer_id;
	bool timeout;
	int ret = 0;

	cnxk_mldev = (struct cnxk_ml_dev *)device;
	cn10k_mldev = &cnxk_mldev->cn10k_mldev;
	model_id = cnxk_mldev->index_map[index].model_id;
	layer_id = cnxk_mldev->index_map[index].layer_id;
	model = cnxk_mldev->mldev->data->models[model_id];
	layer = &model->layer[layer_id];
	req = layer->glow.req;

	op.model_id = index;
	op.impl_opaque = 0;

	cn10k_ml_set_poll_addr(req);
	cn10k_ml_prep_fp_job_descriptor(cnxk_mldev, req, index, input, output, nb_batches);

	memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
	error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
	error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
	req->cn10k_req.result.user_ptr = NULL;

	cnxk_ml_set_poll_ptr(req);
	req->cn10k_req.jcmd.w1.s.jobptr = PLT_U64_CAST(&req->cn10k_req.jd);

	timeout = true;
	req->timeout = plt_tsc_cycles() + ML_CNXK_CMD_TIMEOUT * plt_tsc_hz();
	do {
		if (cn10k_mldev->ml_jcmdq_enqueue(&cn10k_mldev->roc, &req->cn10k_req.jcmd)) {
			req->op = &op;
			timeout = false;
			break;
		}
	} while (plt_tsc_cycles() < req->timeout);

	if (timeout) {
		ret = -EBUSY;
		goto error_enqueue;
	}

	timeout = true;
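	/* Poll the status word updated by the firmware until the job completes
	 * or the timeout expires.
	 */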
	do {
		if (cnxk_ml_get_poll_ptr(req) == ML_CNXK_POLL_JOB_FINISH) {
			timeout = false;
			break;
		}
	} while (plt_tsc_cycles() < req->timeout);

	if (timeout)
		ret = -ETIME;
	else
		cn10k_ml_result_update(cnxk_mldev, -1, req);

error_enqueue:
	return ret;
}

int
cn10k_ml_io_alloc(void *device, uint16_t model_id, const char *layer_name,
		  uint64_t **input_qbuffer, uint64_t **output_qbuffer)
{
	struct cnxk_ml_dev *cnxk_mldev;
	struct cnxk_ml_model *model;
	struct cnxk_ml_layer *layer;
	char str[RTE_MEMZONE_NAMESIZE];
	const struct plt_memzone *mz;
	uint64_t output_size;
	uint64_t input_size;
	uint16_t layer_id;
	int ret;

	cnxk_mldev = (struct cnxk_ml_dev *)device;
	if (cnxk_mldev == NULL) {
		plt_err("Invalid device = %p", device);
		return -EINVAL;
	}

	model = cnxk_mldev->mldev->data->models[model_id];
	if (model == NULL) {
		plt_err("Invalid model_id = %u", model_id);
		return -EINVAL;
	}

	ret = cn10k_ml_model_get_layer_id(model, layer_name, &layer_id);
	if (ret != 0)
		return ret;

	layer = &model->layer[layer_id];
	input_size = PLT_ALIGN_CEIL(layer->info.total_input_sz_q, ML_CN10K_ALIGN_SIZE);
	output_size = PLT_ALIGN_CEIL(layer->info.total_output_sz_q, ML_CN10K_ALIGN_SIZE);

	sprintf(str, "cn10k_ml_io_mz_%u_%u", model_id, layer_id);
	mz = plt_memzone_reserve_aligned(str, input_size + output_size, 0, ML_CN10K_ALIGN_SIZE);
	if (mz == NULL) {
		plt_err("io_alloc failed: Unable to allocate memory: model_id = %u, layer_name = %s",
			model_id, layer_name);
		return -ENOMEM;
	}

	*input_qbuffer = mz->addr;
	*output_qbuffer = PLT_PTR_ADD(mz->addr, input_size);

	return 0;
}

int
cn10k_ml_io_free(void *device, uint16_t model_id, const char *layer_name)
{
	struct cnxk_ml_dev *cnxk_mldev;
	struct cnxk_ml_model *model;
	char str[RTE_MEMZONE_NAMESIZE];
	const struct plt_memzone *mz;
	uint16_t layer_id;
	int ret;

	cnxk_mldev = (struct cnxk_ml_dev *)device;
	if (cnxk_mldev == NULL) {
		plt_err("Invalid device = %p", device);
		return -EINVAL;
	}

	model = cnxk_mldev->mldev->data->models[model_id];
	if (model == NULL) {
		plt_err("Invalid model_id = %u", model_id);
		return -EINVAL;
	}

	ret = cn10k_ml_model_get_layer_id(model, layer_name, &layer_id);
	if (ret != 0)
		return ret;

	sprintf(str, "cn10k_ml_io_mz_%u_%u", model_id, layer_id);
	mz = plt_memzone_lookup(str);
	if (mz == NULL) {
		plt_err("io_free failed: Memzone not found: model_id = %u, layer_name = %s",
			model_id, layer_name);
		return -EINVAL;
	}

	return plt_memzone_free(mz);
}

int
cn10k_ml_malloc(const char *name, size_t size, uint32_t align, void **addr)
{
	const struct plt_memzone *mz;

	mz = plt_memzone_reserve_aligned(name, size, 0, align);
	if (mz == NULL) {
		plt_err("ml_malloc failed: Unable to allocate memory: name = %s", name);
		return -ENOMEM;
	}

	*addr = mz->addr;

	return 0;
}

int
cn10k_ml_free(const char *name)
{
	const struct plt_memzone *mz;

	mz = plt_memzone_lookup(name);
	if (mz == NULL) {
		plt_err("ml_free failed: Memzone not found: name = %s", name);
		return -EINVAL;
	}

	return plt_memzone_free(mz);
}