/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020-2021 HiSilicon Limited.
 */

#ifndef _HNS3_RXTX_VEC_NEON_H_
#define _HNS3_RXTX_VEC_NEON_H_

#include <arm_neon.h>

#pragma GCC diagnostic ignored "-Wcast-qual"

static inline void
hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
{
	/* first 16B of the BD: buffer address and send size */
	uint64x2_t val1 = {
		pkt->buf_iova + pkt->data_off,
		((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT
	};
	/* last 16B of the BD: zeroed VLAN words plus VLD/FE/BD-type flags */
	uint64x2_t val2 = {
		0,
		((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT
	};
	vst1q_u64((uint64_t *)&desc->addr, val1);
	vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
}

static uint16_t
hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
			  struct rte_mbuf **__restrict tx_pkts,
			  uint16_t nb_pkts)
{
	struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
	volatile struct hns3_desc *tx_desc;
	struct hns3_entry *tx_entry;
	uint16_t next_to_use;
	uint16_t nb_commit;
	uint16_t nb_tx;
	uint16_t n, i;

	if (txq->tx_bd_ready < txq->tx_free_thresh)
		hns3_tx_free_buffers(txq);

	nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
	if (unlikely(nb_commit == 0)) {
		txq->queue_full_cnt++;
		return 0;
	}
	nb_tx = nb_commit;

	next_to_use = txq->next_to_use;
	tx_desc = &txq->tx_ring[next_to_use];
	tx_entry = &txq->sw_ring[next_to_use];

	/*
	 * If nb_commit is greater than the number of descriptors left
	 * before the end of the ring (txq->nb_tx_desc - next_to_use),
	 * fill those n descriptors first, then wrap to the ring head
	 * and fill the rest.
	 */
	n = txq->nb_tx_desc - next_to_use;
	if (nb_commit >= n) {
		for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
			hns3_vec_tx(tx_desc, *tx_pkts);
			tx_entry[i].mbuf = *tx_pkts;
		}

		nb_commit -= n;
		next_to_use = 0;
		tx_desc = &txq->tx_ring[next_to_use];
		tx_entry = &txq->sw_ring[next_to_use];
	}

	for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
		hns3_vec_tx(tx_desc, *tx_pkts);
		tx_entry[i].mbuf = *tx_pkts;
	}

	next_to_use += nb_commit;
	txq->next_to_use = next_to_use;
	txq->tx_bd_ready -= nb_tx;

	/* ring the doorbell once for the whole burst */
	hns3_write_reg_opt(txq->io_tail_reg, nb_tx);

	return nb_tx;
}
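
/*
 * A worked example of the wrap handling in hns3_xmit_fixed_burst_vec()
 * above (illustrative numbers, not from the original source): with
 * nb_tx_desc = 512 and next_to_use = 510, a burst of nb_commit = 4 gives
 * n = 512 - 510 = 2, so the first loop fills BDs 510 and 511, next_to_use
 * wraps to 0, and the second loop fills BDs 0 and 1. When the burst fits
 * without wrapping (nb_commit < n), only the second loop runs.
 */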

static inline uint32_t
hns3_desc_parse_field(struct hns3_rx_queue *rxq,
		      struct hns3_entry *sw_ring,
		      struct hns3_desc *rxdp,
		      uint32_t bd_vld_num)
{
	uint32_t l234_info, ol_info, bd_base_info;
	struct rte_mbuf *pkt;
	uint32_t retcode = 0;
	uint32_t cksum_err;
	uint32_t i;
	int ret;

	for (i = 0; i < bd_vld_num; i++) {
		pkt = sw_ring[i].mbuf;

		/* init ol_flags, the 64-bit word after rte_mbuf.rearm_data */
		pkt->ol_flags = PKT_RX_RSS_HASH;

		l234_info = rxdp[i].rx.l234_info;
		ol_info = rxdp[i].rx.ol_info;
		bd_base_info = rxdp[i].rx.bd_base_info;
		ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info,
					 l234_info, &cksum_err);
		if (unlikely(ret)) {
			/* mark the erroneous descriptor in the return mask */
			retcode |= 1u << i;
			continue;
		}

		pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
		if (likely(bd_base_info & BIT(HNS3_RXD_L3L4P_B)))
			hns3_rx_set_cksum_flag(pkt, pkt->packet_type,
					       cksum_err);
	}

	return retcode;
}

static inline uint16_t
hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
		    struct rte_mbuf **__restrict rx_pkts,
		    uint16_t nb_pkts,
		    uint64_t *bd_err_mask)
{
	uint16_t rx_id = rxq->next_to_use;
	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
	uint32_t bd_valid_num, parse_retcode;
	uint16_t nb_rx = 0;
	uint32_t pos;
	int offset;

	/* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
	uint8x16_t shuf_desc_fields_msk = {
		0xff, 0xff, 0xff, 0xff,	/* packet type init as zero */
		22, 23, 0xff, 0xff,	/* rx.pkt_len to rte_mbuf.pkt_len */
		20, 21,			/* size to rte_mbuf.data_len */
		0xff, 0xff,		/* rte_mbuf.vlan_tci init as zero */
		8, 9, 10, 11,		/* rx.rss_hash to rte_mbuf.hash.rss */
	};

	uint16x8_t crc_adjust = {
		0, 0,		/* ignore pkt_type field */
		rxq->crc_len,	/* sub crc on pkt_len */
		0,		/* ignore high-16bits of pkt_len */
		rxq->crc_len,	/* sub crc on data_len */
		0, 0, 0,	/* ignore non-length fields */
	};

	for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
				     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
		uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
		uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint64x2_t mbp1, mbp2;
		uint16x4_t bd_vld = {0};
		uint16x8_t tmp;
		uint64_t stat;

		/* gather the VLD words of the next four BDs */
		bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
		bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
		bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
		bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);

		/* load the first 2 mbuf pointers */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

		/*
		 * Move the VLD bit into each lane's sign bit, then
		 * arithmetic-shift right to broadcast it across the lane:
		 * a valid BD becomes 0xffff, an invalid one 0x0000.
		 */
		bd_vld = vshl_n_u16(bd_vld,
				    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
		bd_vld = vreinterpret_u16_s16(
			vshr_n_s16(vreinterpret_s16_u16(bd_vld),
				   HNS3_UINT16_BIT - 1));
		stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);

		/* load the remaining 2 mbuf pointers */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		/* contiguous valid BDs = trailing zero bits of stat / 16 */
		if (likely(stat == 0))
			bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
		else
			bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
		if (bd_valid_num == 0)
			break;

		/*
		 * Use offset to control the ordering of the data loads
		 * below: indexing through offset_table makes these loads
		 * data-dependent on bd_valid_num, which keeps them ordered
		 * after the validity check above.
		 */
		offset = rxq->offset_table[bd_valid_num];

		/* store the first 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* read the first two descs */
		descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
		descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));

		/* store the remaining 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		/* read the remaining two descs */
		descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
		descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));

		pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
		pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
		pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
		pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);

		/* pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
		pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);

		/* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
		*(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
			rxq->mbuf_initializer;
		*(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
			rxq->mbuf_initializer;

		/* pkt 1,2 remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);

		pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
		pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
		pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
		pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);

		/* pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
		pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);

		/* pkt 1,2 save to rx_pkts mbuf */
		vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
			 pkt_mb1);
		vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
			 pkt_mb2);

		/* pkt 3,4 remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
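
		/*
		 * The CRC subtraction above is lane-wise: only the pkt_len
		 * and data_len lanes of crc_adjust carry rxq->crc_len, so
		 * every other field subtracts zero. crc_len is expected to
		 * be non-zero only when the KEEP_CRC Rx offload is enabled,
		 * in which case the hardware leaves the 4-byte FCS in the
		 * buffer and counts it in the BD length, so it has to be
		 * removed from the mbuf length fields here.
		 */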

		/* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
		*(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
			rxq->mbuf_initializer;
		*(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
			rxq->mbuf_initializer;

		/* pkt 3,4 save to rx_pkts mbuf */
		vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
			 pkt_mb3);
		vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
			 pkt_mb4);

		rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);

		parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
						      &rxdp[offset],
						      bd_valid_num);
		if (unlikely(parse_retcode))
			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;

		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
		rte_prefetch0(sw_ring[pos +
				      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);

		nb_rx += bd_valid_num;
		if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
			break;
	}

	rxq->rx_rearm_nb += nb_rx;
	rxq->next_to_use += nb_rx;
	if (rxq->next_to_use >= rxq->nb_rx_desc)
		rxq->next_to_use = 0;

	return nb_rx;
}
#endif /* _HNS3_RXTX_VEC_NEON_H_ */
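
/*
 * Usage sketch (assumed caller, not part of this header): in the upstream
 * DPDK driver these helpers are wrapped by the burst callbacks in
 * hns3_rxtx_vec.c. The Rx side looks roughly like the following, where
 * hns3_rxq_rearm_mbuf() and hns3_rx_reassemble_pkts() are defined in that
 * file:
 *
 *	uint16_t
 *	hns3_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 *			   uint16_t nb_pkts)
 *	{
 *		struct hns3_rx_queue *rxq = rx_queue;
 *		uint64_t bd_err_mask = 0;
 *		uint16_t nb_rx;
 *
 *		nb_pkts = RTE_ALIGN_FLOOR(nb_pkts,
 *					  HNS3_DEFAULT_DESCS_PER_LOOP);
 *		if (rxq->rx_rearm_nb > HNS3_DEFAULT_RXQ_REARM_THRESH)
 *			hns3_rxq_rearm_mbuf(rxq);
 *		nb_rx = hns3_recv_burst_vec(rxq, rx_pkts, nb_pkts,
 *					    &bd_err_mask);
 *		if (unlikely(bd_err_mask))
 *			nb_rx = hns3_rx_reassemble_pkts(rx_pkts, nb_rx,
 *							bd_err_mask);
 *		return nb_rx;
 *	}
 */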