/*
 * Width-specific parameters for the AVX512 16-lane ACL search
 * specialization.  The generic search template (acl_run_avx512_common.h,
 * included below) expands these macros; they are #undef'ed at the end of
 * this file -- presumably so another SIMD width can re-instantiate the
 * same template (TODO: confirm against the x8 variant).
 */

/* SIMD vector type and per-lane predicate mask type for this width. */
#define _T_simd __m512i
#define _T_mask __mmask16

/* Accessors for the width-specific constant tables declared below:
 * _SC_ names the table object, _SV_ yields its SIMD-register view. */
#define _SC_(x) zmm_##x
#define _SV_(x) (zmm_##x.z)

/* Suffix applied to every function generated from the common template. */
#define _F_(x) x##_avx512x16

/* Builders mapping generic intrinsic names onto their _mm512_* forms
 * (plain, _si512-suffixed, masked-gather, and gather-with-args). */
#define _M_I_(x) _mm512_##x
#define _M_SI_(x) _mm512_##x##_si512
#define _M_MGI_(x) _mm512_##x
#define _M_GI_(name, idx, base, scale) _mm512_##name(idx, base, scale)

/* Number of 32-bit lanes per vector (16) and the all-lanes bit mask. */
#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_ CNE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

/* Flows processed in parallel: two vectors' worth of 32-bit lanes. */
#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)

/* Number of 64-bit pointer lanes per vector (8) and the matching mask. */
#define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_ CNE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
/* Constant tables used by the trie walk.  NOTE(review): the initializer
 * bodies are not visible in this chunk -- contents must be verified
 * against the full file before relying on them. */
static const __cne_x86_zmm_t _SC_(match_mask) = {

static const __cne_x86_zmm_t _SC_(index_mask) = {

/* trlo/trhi: presumably idle-transition patterns for the low and high
 * halves of the transition vectors -- confirm against the template. */
static const __cne_x86_zmm_t _SC_(trlo_idle) = {

static const __cne_x86_zmm_t _SC_(trhi_idle) = {
/* Byte-shuffle control: within every 128-bit lane, broadcast input
 * byte 0 across dword 0, byte 4 across dword 1, byte 8 across dword 2
 * and byte 12 across dword 3 (one input byte replicated per dword).
 * Presumably consumed via a byte-shuffle intrinsic in the common
 * template -- confirm. */
static const __cne_x86_zmm_t _SC_(shuffle_input) = {
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
/* Presumably the constant 4 in each 32-bit lane (initializer elided in
 * this view -- verify). */
static const __cne_x86_zmm_t _SC_(four_32) = {

/* Per-lane flow-index increments (initializer elided in this view). */
static const __cne_x86_zmm_t _SC_(idx_add) = {

/* Base values for range comparisons: low byte 0x00/0x04/0x08/0x0c per
 * dword, upper three bytes 0xff, repeated across all 128-bit lanes. */
static const __cne_x86_zmm_t _SC_(range_base) = {
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
/* permutex2var control used by gather_bytes below: selects dwords 0-7
 * of the first operand (indices 0x00-0x07) followed by dwords 0-7 of
 * the second operand (indices 0x10-0x17), packing two 8-dword gather
 * results into a single __m512i. */
static const __cne_x86_zmm_t _SC_(pminp) = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
/* Mask selecting the even 32-bit lanes (bits 0, 2, ..., 14). */
static const _T_mask _SC_(pmidx_msk) = 0x5555;

/* Index pairs {i, 0}: lane indices 0..7 (first table) and 8..15
 * (second), presumably used with pmidx_msk to expand packed 32-bit
 * indices into 64-bit lanes -- confirm against the common template. */
static const __cne_x86_zmm_t _SC_(pmidx[2]) = {

        0, 0, 1, 0, 2, 0, 3, 0,
        4, 0, 5, 0, 6, 0, 7, 0,

        8, 0, 9, 0, 10, 0, 11, 0,
        12, 0, 13, 0, 14, 0, 15, 0,
/*
 * Gather one byte from each of up to 8 addresses packed into the 64-bit
 * lanes of @pdata; @mask selects the lanes whose pointers are valid.
 * Returns the gathered bytes zero-extended into the eight 32-bit lanes
 * of a __m256i; inactive lanes read 0.
 */
static inline __m256i
_m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)

    /* Static storage duration => zero-initialized; safe dummy target. */
    static const uint32_t zero;

    /* Point every inactive lane (mask ^ _SIMD_PTR_MSK_) at &zero so all
     * 8 lanes can be dereferenced unconditionally below. */
    p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_, (uintptr_t)&zero);

    /* Scalar byte loads; each byte is zero-extended into one dword. */
    v.u32[0] = *(uint8_t *)p.u64[0];
    v.u32[1] = *(uint8_t *)p.u64[1];
    v.u32[2] = *(uint8_t *)p.u64[2];
    v.u32[3] = *(uint8_t *)p.u64[3];
    v.u32[4] = *(uint8_t *)p.u64[4];
    v.u32[5] = *(uint8_t *)p.u64[5];
    v.u32[6] = *(uint8_t *)p.u64[6];
    v.u32[7] = *(uint8_t *)p.u64[7];
/*
 * Gather input bytes for 16 flows.  @p holds two vectors of 8 source
 * pointers each, @m the corresponding active-lane masks, @bnum the
 * number of bytes to fetch per flow; @zero supplies the value for
 * inactive lanes.  The two 8-dword results are packed into one
 * __m512i using the pminp permute control.
 */
_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2], uint32_t bnum)

    if (bnum == sizeof(uint8_t)) {
        /* One byte per flow: no byte-granularity gather exists, so use
         * the scalar-assisted helper above. */
        inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
        inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);

        /* Multi-byte path (else-branch head elided in this view):
         * hardware dword gather; base is NULL because the lanes already
         * hold absolute addresses. */
        inp[0] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero), m[0], p[0], NULL,
        inp[1] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero), m[1], p[1], NULL,

    /* Pack: dwords 0-7 from inp[0] followed by dwords 0-7 of inp[1]. */
    return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]), _SV_(pminp),
                                     _mm512_castsi256_si512(inp[1]));
/*
 * Resolve the winning match per packet and category for the
 * large-category case ("mcgt8" -- presumably more than 8 categories;
 * confirm against the mcle8 sibling).  @result receives nb_cat 32-bit
 * results per packet; @pr is the per-trie match-result table; @match
 * holds one match index per (trie, packet), trie-major
 * (nb_trie x nb_pkt).
 */
resolve_mcgt8_avx512x1(uint32_t result[], const struct cne_acl_match_results pr[],
                       const uint32_t match[], uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)

    const uint32_t *pm, *res;

    __m512i cp, cr, np, nr;

    /* Each match entry spans 1 << match_log (= 32) dwords in res/pri. */
    const uint32_t match_log = 5;

    /* Lane mask covering the nb_cat active category lanes. */
    cm = (1 << nb_cat) - 1;

    for (k = 0; k != nb_pkt; k++, result += nb_cat) {

        /* Seed with trie 0's results (cr) and priorities (cp). */
        mi = match[k] << match_log;

        cr = _mm512_maskz_loadu_epi32(cm, res + mi);
        cp = _mm512_maskz_loadu_epi32(cm, pri + mi);

        /* Fold in the remaining tries, keeping the higher priority
         * per category lane. */
        for (i = 1, pm = match + nb_pkt; i != nb_trie; i++, pm += nb_pkt) {

            mi = pm[k] << match_log;

            nr = _mm512_maskz_loadu_epi32(cm, res + mi);
            np = _mm512_maskz_loadu_epi32(cm, pri + mi);

            /* Where the current priority wins (cp > np) keep cr/cp,
             * otherwise take the new trie's nr/np. */
            sm = _mm512_cmpgt_epi32_mask(cp, np);
            cr = _mm512_mask_mov_epi32(nr, sm, cr);
            cp = _mm512_mask_mov_epi32(np, sm, cp);

        /* Store the winning results for this packet's categories. */
        _mm512_mask_storeu_epi32(result, cm, cr);
/* Instantiate the generic AVX512 search template with the macros and
 * constant tables defined above. */
#include "acl_run_avx512_common.h"
/*
 * Top-level AVX512 x16 classify entry: run every trie over all packets,
 * collecting one match index per (trie, packet) into match[], then
 * resolve the per-category winners into @results.
 */
search_avx512x16x2(const struct cne_acl_ctx *ctx, const uint8_t **data, uint32_t *results,
                   uint32_t total_packets, uint32_t categories)

    const struct cne_acl_match_results *pr;
    struct acl_flow_avx512 flow;
    /* VLA scratch: one match slot per trie per packet, trie-major. */
    uint32_t match[ctx->num_tries * total_packets];

    for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

        /* Bind the flow state to trie @i, then run the trie search. */
        acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

        _F_(search_trie)(&flow);

    /* Match-result table sits after the transitions, at match_index. */
    pr = (const struct cne_acl_match_results *)(ctx->trans_table + ctx->match_index);

    /* Pick the resolver by category count (single / <=half-max / rest;
     * the single-category guard line is elided in this view). */
    _F_(resolve_single_cat)(results, pr, match, total_packets, ctx->num_tries);
    else if (categories <= CNE_ACL_MAX_CATEGORIES / 2)
        resolve_mcle8_avx512x1(results, pr, match, total_packets, categories, ctx->num_tries);

        resolve_mcgt8_avx512x1(results, pr, match, total_packets, categories, ctx->num_tries);
/* Tear down the width-specific template parameters so the generic
 * template can be re-instantiated for a different SIMD width. */
#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
/* NOTE(review): defining __cne_always_inline to an empty expansion here
 * would override the inline-attribute macro presumably provided by the
 * common headers; this line looks like a stray or truncated artifact --
 * verify it belongs in this file before keeping it. */
#define __cne_always_inline