/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2022 Marvell.
 */

#ifndef IPSEC_NEON_H
#define IPSEC_NEON_H

#include "ipsec.h"
#include "neon/port_group.h"

#define MAX_TX_BURST	(MAX_PKT_BURST / 2)

extern xmm_t val_eth[RTE_MAX_ETHPORTS];

/*
 * Update source and destination MAC addresses in the ethernet header.
 */
static inline void
processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
		uint64_t tx_offloads, bool ip_cksum, bool is_ipv4, uint8_t *l_pkt)
{
	uint32x4_t te[FWDSTEP];
	uint32x4_t ve[FWDSTEP];
	uint32_t *p[FWDSTEP];
	struct rte_mbuf *pkt;
	uint32_t val;
	uint8_t i;

	for (i = 0; i < FWDSTEP; i++) {
		pkt = pkts[i];

		/* Check if it is a large packet */
		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
			*l_pkt |= 1;

		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
		te[i] = vld1q_u32(p[i]);

		/* Update last 4 bytes */
		val = vgetq_lane_u32(te[i], 3);
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
		val &= 0xFFFFUL << 16;
		val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 :
					RTE_ETHER_TYPE_IPV6);
#else
		val &= 0xFFFFUL;
		val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 :
					RTE_ETHER_TYPE_IPV6) << 16;
#endif
		ve[i] = vsetq_lane_u32(val, ve[i], 3);
		vst1q_u32(p[i], ve[i]);

		if (ip_cksum) {
			struct rte_ipv4_hdr *ip;

			pkt->ol_flags |= tx_offloads;

			ip = (struct rte_ipv4_hdr *)
				(((uintptr_t)p[i]) + RTE_ETHER_HDR_LEN);
			ip->hdr_checksum = 0;

			/* calculate IPv4 cksum in SW */
			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
				ip->hdr_checksum = rte_ipv4_cksum(ip);
		}
	}
}

/**
 * Process single packet:
 * Update source and destination MAC addresses in the ethernet header.
 */
static inline void
process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
	       bool ip_cksum, bool is_ipv4, uint8_t *l_pkt)
{
	struct rte_ether_hdr *eth_hdr;
	uint32x4_t te, ve;
	uint32_t val;

	/* Check if it is a large packet */
	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
		*l_pkt |= 1;

	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);

	te = vld1q_u32((uint32_t *)eth_hdr);
	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

	val = vgetq_lane_u32(te, 3);
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
	val &= 0xFFFFUL << 16;
	val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 :
				RTE_ETHER_TYPE_IPV6);
#else
	val &= 0xFFFFUL;
	val |= rte_cpu_to_be_16(is_ipv4 ? RTE_ETHER_TYPE_IPV4 :
				RTE_ETHER_TYPE_IPV6) << 16;
#endif
	ve = vsetq_lane_u32(val, ve, 3);
	vst1q_u32((uint32_t *)eth_hdr, ve);

	if (ip_cksum) {
		struct rte_ipv4_hdr *ip;

		pkt->ol_flags |= tx_offloads;

		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
		ip->hdr_checksum = 0;

		/* calculate IPv4 cksum in SW */
		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
			ip->hdr_checksum = rte_ipv4_cksum(ip);
	}
}

/*
 * Send packets one by one, tagging each with its L3 protocol so that
 * oversized packets can be fragmented on transmit.
 */
static inline void
send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
{
	uint8_t proto;
	uint32_t i;

	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
	for (i = 0; i < num; i++)
		send_single_packet(m[i], port, proto);
}

/*
 * Buffer packets for TX on the given port, transmitting as soon as a full
 * burst is available.
 */
static inline void
send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
{
	unsigned int lcoreid = rte_lcore_id();
	struct lcore_conf *qconf;
	uint32_t len, j, n;

	qconf = &lcore_conf[lcoreid];

	len = qconf->tx_mbufs[port].len;

	/*
	 * If TX buffer for that queue is empty, and we have enough packets,
	 * then send them straightway.
	 */
	if (num >= MAX_TX_BURST && len == 0) {
		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
		core_stats_update_tx(n);
		if (unlikely(n < num)) {
			do {
				rte_pktmbuf_free(m[n]);
			} while (++n < num);
		}
		return;
	}

	/*
	 * Put packets into TX buffer for that queue.
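	 *
	 * The copy below is unrolled by FWDSTEP: the switch on
	 * (n % FWDSTEP) jumps into the middle of the unrolled while loop
	 * (Duff's device) so the remainder is absorbed into the first pass
	 * and no separate tail loop is needed.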
	 */

	n = len + num;
	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;

	j = 0;
	switch (n % FWDSTEP) {
	while (j < n) {
	case 0:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
		/* fallthrough */
	case 3:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
		/* fallthrough */
	case 2:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
		/* fallthrough */
	case 1:
		qconf->tx_mbufs[port].m_table[len + j] = m[j];
		j++;
	}
	}

	len += n;

	/* enough pkts to be sent */
	if (unlikely(len == MAX_PKT_BURST)) {

		send_burst(qconf, MAX_PKT_BURST, port);

		/* copy rest of the packets into the TX buffer. */
		len = num - n;
		if (len == 0)
			goto exit;

		j = 0;
		switch (len % FWDSTEP) {
		while (j < len) {
		case 0:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
			/* fallthrough */
		case 3:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
			/* fallthrough */
		case 2:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
			/* fallthrough */
		case 1:
			qconf->tx_mbufs[port].m_table[j] = m[n + j];
			j++;
		}
		}
	}

exit:
	qconf->tx_mbufs[port].len = len;
}

/**
 * Send a burst of packets out to the ports in the dst_port array.
 */
static __rte_always_inline void
send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
{
	unsigned int lcoreid = rte_lcore_id();
	uint16_t pnum[MAX_PKT_BURST + 1];
	uint8_t l_pkt = 0;
	uint16_t dlp, *lp;
	int i = 0, k;

	/*
	 * Finish packet processing and group consecutive
	 * packets with the same destination port.
	 */
	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);

	if (k != 0) {
		uint16x8_t dp1, dp2;

		lp = pnum;
		lp[0] = 1;

		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, is_ipv4,
				&l_pkt);

		/* dp1: <d[0], d[1], d[2], d[3], ... > */
		dp1 = vld1q_u16(dst_port);

		for (i = FWDSTEP; i != k; i += FWDSTEP) {
			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
					ip_cksum, is_ipv4, &l_pkt);

			/*
			 * dp2:
			 * <d[i - 3], d[i - 2], d[i - 1], d[i], ... >
			 */
			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
			lp = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);

			/*
			 * dp1:
			 * <d[i], d[i + 1], d[i + 2], d[i + 3], ... >
			 */
			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
		}

		/*
		 * dp2: <d[i - 3], d[i - 2], d[i - 1], d[i - 1], ... >
		 */
		dp2 = vextq_u16(dp1, dp1, 1);
		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
		lp = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);

		/*
		 * remove values added by the last repeated
		 * dst port.
		 */
		lp[0]--;
		dlp = dst_port[i - 1];
	} else {
		/* set dlp and lp to never-used values. */
		dlp = BAD_PORT - 1;
		lp = pnum + MAX_PKT_BURST;
	}

	/* Process up to the last 3 packets one by one. */
	switch (nb_rx % FWDSTEP) {
	case 3:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
			       is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
		i++;
		/* fallthrough */
	case 2:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
			       is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
		i++;
		/* fallthrough */
	case 1:
		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
			       is_ipv4, &l_pkt);
		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
	}

	/*
	 * Send packets out, through destination port.
	 * Consecutive packets with the same destination port
	 * are already grouped together.
	 * If destination port for the packet equals BAD_PORT,
	 * then free the packet without sending it out.
	 */
	for (i = 0; i < nb_rx; i += k) {
		uint16_t pn;

		pn = dst_port[i];
		k = pnum[i];

		if (likely(pn != BAD_PORT)) {
			if (l_pkt)
				/* Large packet is present, need to send
				 * packets individually so they can be
				 * fragmented.
				 */
				send_packets(pkts + i, pn, k, is_ipv4);
			else
				send_packetsx4(pkts + i, pn, k);
		} else {
			free_pkts(&pkts[i], k);
			if (is_ipv4)
				core_statistics[lcoreid].lpm4.miss++;
			else
				core_statistics[lcoreid].lpm6.miss++;
		}
	}
}

#endif /* IPSEC_NEON_H */
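
/*
 * Usage sketch (illustration only, not part of the original header): a
 * caller such as the ipsec-secgw worker loop is assumed to resolve one
 * destination port per received mbuf, marking route-lookup misses with
 * BAD_PORT, and then hand the whole burst to send_multi_pkts(). The
 * route_lookup() helper and dev_tx_offloads value below are hypothetical
 * names for that lookup and for the port's TX offload flags.
 *
 *	uint16_t dst_port[MAX_PKT_BURST];
 *	int32_t i;
 *
 *	for (i = 0; i < nb_rx; i++)
 *		dst_port[i] = route_lookup(pkts[i]);
 *
 *	send_multi_pkts(pkts, dst_port, nb_rx, dev_tx_offloads,
 *			true, true);	(ip_cksum, is_ipv4)
 */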