/*
 * calc_addr: compute, per 32-bit lane, the index of the next transition.
 *
 * The visible statements take the bits of tr_lo selected by index_mask and
 * add a per-lane offset picked from two candidates:
 *   - dfa_ofs  in lanes where node_type compares equal to zero;
 *   - quad_ofs in every other lane.
 *
 * NOTE(review): the return type, the declarations of dfa_msk and qm, the
 * braces and the return statement are not visible in this excerpt; addr is
 * presumably the returned value - confirm against the full file.
 */
26 _F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input, _T_simd four_32,
27 _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
31 _T_simd addr, in, node_type, r, t;
32 _T_simd dfa_ofs, quad_ofs;
/* x ^ x: t becomes an all-zero vector, used below as the compare operand. */
34 t = _M_SI_(xor)(index_mask, index_mask);
/* Byte-shuffle next_input using shuffle_input as the control vector. */
35 in = _M_I_(shuffle_epi8)(next_input, shuffle_input);
/*
 * Split tr_lo by index_mask. Assuming _M_SI_(andnot)(a, b) computes ~a & b
 * like the Intel andnot intrinsics (TODO confirm macro expansion), node_type
 * holds the tr_lo bits outside index_mask and addr the bits inside it.
 */
38 node_type = _M_SI_(andnot)(index_mask, tr_lo);
39 addr = _M_SI_(and)(index_mask, tr_lo);
/* One mask bit per 32-bit lane whose node_type is zero. */
42 dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);
/*
 * First candidate offset: r = (in >> 30) plus range_base (byte-wise add) is
 * used as the byte-shuffle control that picks bytes out of tr_hi;
 * dfa_ofs = (in >> 24) - shuffled tr_hi bytes, per 32-bit lane.
 */
45 r = _M_I_(srli_epi32)(in, 30);
46 r = _M_I_(add_epi8)(r, range_base);
47 t = _M_I_(srli_epi32)(in, 24);
48 r = _M_I_(shuffle_epi8)(tr_hi, r);
50 dfa_ofs = _M_I_(sub_epi32)(t, r);
/*
 * Second candidate offset: flag every byte of in that compares greater than
 * the matching byte of tr_hi, expand the flags to 0xFF/0x00 bytes, then
 * count leading zero bits per 32-bit lane. Each byte is all-ones or
 * all-zeros, so (lzcnt >> 3) is the number of leading zero bytes (0..4).
 * quad_ofs = four_32 - that count.
 */
53 qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
54 t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
55 t = _M_I_(lzcnt_epi32)(t);
56 t = _M_I_(srli_epi32)(t, 3);
57 quad_ofs = _M_I_(sub_epi32)(four_32, t);
/* Blend: dfa_ofs where the dfa_msk bit is set, quad_ofs elsewhere. */
60 t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);
/* Add the chosen offset to the index bits taken from tr_lo. */
63 addr = _M_I_(add_epi32)(addr, t);
/*
 * trans: perform one transition step for all lanes.
 *
 * Computes the next transition index with calc_addr() from the current
 * *tr_lo / *tr_hi and next_input, then reloads *tr_lo and *tr_hi from the
 * transition table at that index. next_input is shifted right by CHAR_BIT
 * bits per 32-bit lane between the two loads.
 *
 * NOTE(review): the return type, the declarations of tr and addr and the
 * return statement are not visible here; callers assign the result back to
 * their input vector, so the shifted next_input is presumably returned.
 */
74 _F_(trans)(_T_simd next_input,
const uint64_t *trans, _T_simd *tr_lo, _T_simd *tr_hi)
/* View the table of 64-bit entries as 32-bit words for the gathers below. */
79 tr = (
const int32_t *)(uintptr_t)trans;
82 addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input), _SV_(four_32),
83 _SV_(range_base), *tr_lo, *tr_hi);
/*
 * The gather scale is sizeof(trans[0]), so addr indexes whole 64-bit
 * entries; reading from tr fetches the first 32-bit word of each entry.
 */
86 *tr_lo = _M_GI_(i32gather_epi32, addr, tr,
sizeof(trans[0]));
/* Shift each 32-bit lane of the input right by CHAR_BIT bits. */
88 next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);
/* Same indices, base advanced by one int32_t: second word of each entry. */
91 *tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1),
sizeof(trans[0]));
/*
 * first_trans: load the first transition for the lanes selected by msk.
 *
 * The table index is flow->root_index plus the low 8 bits of each 32-bit
 * lane of next_input. Both 32-bit words of the 64-bit entry at that index
 * are gathered into *tr_lo and *tr_hi. The current *tr_lo / *tr_hi are
 * passed as the gather source operand, so lanes not in msk are expected to
 * keep their previous contents (assuming _M_MGI_ wraps the Intel masked
 * gather - TODO confirm macro expansion).
 *
 * NOTE(review): the return type and the declarations of tr, addr and root
 * are not visible in this excerpt.
 */
104 _F_(first_trans)(
const struct acl_flow_avx512 *flow, _T_simd next_input, _T_mask msk,
105 _T_simd *tr_lo, _T_simd *tr_hi)
/* View the table of 64-bit entries as 32-bit words. */
110 tr = (
const int32_t *)(uintptr_t)flow->trans;
/* addr temporarily holds the 0xFF-per-lane mask; root holds the root index. */
112 addr = _M_I_(set1_epi32)(UINT8_MAX);
113 root = _M_I_(set1_epi32)(flow->root_index);
/* Keep only the low 8 bits of each input lane, then offset by the root. */
115 addr = _M_SI_(and)(next_input, addr);
116 addr = _M_I_(add_epi32)(root, addr);
/* Scale is sizeof(flow->trans[0]): first 32-bit word of each entry. */
119 *tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
sizeof(flow->trans[0]));
/* Base advanced by one int32_t: second 32-bit word of each entry. */
122 *tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
sizeof(flow->trans[0]));
/*
 * get_next_bytes: fetch the next input bytes for the lanes selected by msk.
 *
 * For each selected lane, reads an offset from flow->data_index at position
 * *di, advances *di by one, adds the offset to the per-lane 64-bit value in
 * pdata[] and hands the resulting values to gather_bytes(), whose result is
 * returned.
 *
 * NOTE(review): the parameter list is truncated in this excerpt (bnum, used
 * in the final call, is presumably the last parameter) and the declarations
 * of div and m are not visible.
 */
131 static inline _T_simd
132 _F_(get_next_bytes)(
const struct acl_flow_avx512 *flow, _T_simd pdata[2], uint32_t msk, _T_simd *di,
137 _T_simd one, zero, t, p[2];
/* Treat the data-index array as 32-bit elements for the gather below. */
139 div = (
const int32_t *)flow->data_index;
/* one = 1 in every 32-bit lane; zero = one ^ one = all-zero vector. */
141 one = _M_I_(set1_epi32)(1);
142 zero = _M_SI_(xor)(one, one);
/* Load div[*di] for lanes in msk; zero is the source for other lanes. */
145 t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div,
sizeof(div[0]));
/* Advance the data index by one, only in lanes selected by msk. */
148 *di = _M_I_(mask_add_epi32)(*di, msk, *di, one);
/*
 * Redistribute the 32-bit offsets in t into two vectors according to the
 * pmidx[] permutation indices and the pmidx_msk zeroing mask, so they can
 * be added to the 64-bit lanes of pdata[] (presumably one 32-bit offset per
 * 64-bit lane - confirm against the pmidx definitions).
 */
156 p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]), t);
157 p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]), t);
/* 64-bit add of the offsets to the per-lane values in pdata[]. */
159 p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
160 p[1] = _M_I_(add_epi64)(p[1], pdata[1]);
/* Split the lane mask into the parts that go with p[0] and p[1]. */
164 m[0] = msk & _SIMD_PTR_MSK_;
165 m[1] = msk >> _SIMD_PTR_NUM_;
167 return _F_(gather_bytes)(zero, p, m, bnum);
/*
 * start_flow: place num new entries from flow->idata into the lanes
 * selected by msk.
 *
 * For the selected lanes this fills pdata[] from flow->idata, starting at
 * element flow->num_packets, sets *idx from flow->num_packets + idx_add,
 * and clears *di. flow->num_packets is then advanced by num.
 *
 * NOTE(review): the return type and the declarations of nd and ni are not
 * visible in this excerpt.
 */
179 _F_(start_flow)(
struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk, _T_simd pdata[2],
180 _T_simd *idx, _T_simd *di)
182 uint32_t n, m[2], nm[2];
/* Split the lane mask into the parts that go with pdata[0] and pdata[1]. */
186 m[0] = msk & _SIMD_PTR_MSK_;
187 m[1] = msk >> _SIMD_PTR_NUM_;
/*
 * n = number of selected lanes in the first part. nm[0] / nm[1] are
 * contiguous low-bit masks with n and (num - n) bits set.
 * NOTE(review): relies on num >= n and on both shift counts being smaller
 * than the width of int; neither is checked here - confirm with callers.
 */
190 n = __builtin_popcount(m[0]);
191 nm[0] = (1 << n) - 1;
192 nm[1] = (1 << (num - n)) - 1;
/* Contiguous masked loads of the next n and (num - n) 64-bit elements. */
195 nd[0] = _M_I_(maskz_loadu_epi64)(nm[0], flow->idata + flow->num_packets);
196 nd[1] = _M_I_(maskz_loadu_epi64)(nm[1], flow->idata + flow->num_packets + n);
/* ni = flow->num_packets + idx_add, per 32-bit lane. */
199 ni = _M_I_(set1_epi32)(flow->num_packets);
200 ni = _M_I_(add_epi32)(ni, _SV_(idx_add));
/* Expand the contiguous values into the selected lanes; others unchanged. */
203 pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
204 pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);
207 *idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
/* Keep *di only where msk ^ _SIMD_MASK_MAX_ has a bit set; zero the rest. */
208 *di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);
210 flow->num_packets += num;
/*
 * match_process: handle the lanes flagged in rmsk[0].
 *
 * Stores tr_lo[0] & index_mask for the flagged lanes into flow->matches at
 * the positions given by idx[0], replaces the flagged lanes of tr_lo[0] and
 * tr_hi[0] with the trlo_idle / trhi_idle values, calls update_flow_mask()
 * and then start_flow() with the count it returned.
 *
 * NOTE(review): the declarations of n and res and the return statement are
 * not visible in this excerpt; n is presumably the returned value.
 */
223 static inline uint32_t
224 _F_(match_process)(
struct acl_flow_avx512 *flow, uint32_t *fmsk, uint32_t *rmsk, _T_simd pdata[2],
225 _T_simd *di, _T_simd *idx, _T_simd *tr_lo, _T_simd *tr_hi)
/* Extract the index_mask bits of tr_lo before it is overwritten below. */
234 res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));
/* Flagged lanes get the idle values; other lanes keep their contents. */
237 tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
238 tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));
/* Scatter res for flagged lanes; scale is sizeof(flow->matches[0]). */
241 _M_I_(mask_i32scatter_epi32)
242 ((
void *)flow->matches, rmsk[0], idx[0], res,
sizeof(flow->matches[0]));
/*
 * update_flow_mask() is defined outside this excerpt; it receives pointers
 * to both masks and may modify them (TODO confirm). Its result is used as
 * the number of new entries for start_flow().
 */
245 n = update_flow_mask(flow, fmsk, rmsk);
246 _F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);
/*
 * match_check_process: repeatedly process flagged lanes in both halves.
 *
 * rm[i] flags the 32-bit lanes of tr_lo[i] that have any match_mask bit
 * set. While either half has flagged lanes, match_process() is called for
 * each half; the visible statements then fetch the first bytes for the
 * lanes in rm[i], run first_trans() on them and recompute rm[i].
 *
 * NOTE(review): the parameter list, both match_process() calls, the
 * declarations of n and rm and any conditions around the per-half refill
 * are truncated or missing in this excerpt - consult the full file before
 * relying on this description.
 */
256 _F_(match_check_process)(
struct acl_flow_avx512 *flow, uint32_t fm[2], _T_simd pdata[4],
257 _T_simd di[2], _T_simd idx[2], _T_simd inp[2], _T_simd tr_lo[2],
/* Lanes where (tr_lo & match_mask) != 0. */
264 rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
265 rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
268 while ((rm[0] | rm[1]) != 0) {
/* Half 0 uses pdata[0..1]; half 1 uses pdata[2..3]. */
271 n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0], &di[0], &idx[0], &tr_lo[0],
273 n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2], &di[1], &idx[1], &tr_lo[1],
/* Half 0: load flow->first_load_sz bytes, do the first transition, retest. */
279 inp[0] = _F_(get_next_bytes)(flow, &pdata[0], rm[0], &di[0], flow->first_load_sz);
280 _F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0], &tr_hi[0]);
281 rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
/* Half 1: same sequence on the second set of vectors. */
285 inp[1] = _F_(get_next_bytes)(flow, &pdata[2], rm[1], &di[1], flow->first_load_sz);
286 _F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1], &tr_hi[1]);
287 rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
/*
 * search_trie: drive the search for one flow, two vector halves at a time.
 *
 * Starts both halves with all lanes selected, loads the first bytes, does
 * the first transition and processes any flagged lanes. Then, while either
 * fm[] mask is non-zero, loads sizeof(uint32_t) bytes per selected lane,
 * applies trans() four times per half and calls match_check_process().
 *
 * NOTE(review): the return type, the declaration of fm and the closing
 * braces are not visible in this excerpt.
 */
297 _F_(search_trie)(
struct acl_flow_avx512 *flow)
300 _T_simd di[2] = {0}, idx[2] = {0}, in[2], pdata[4] = {0}, tr_lo[2] = {0}, tr_hi[2] = {0};
/* Fill every lane of both halves (pdata[0..1] and pdata[2..3]). */
303 _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, &pdata[0], &idx[0], &di[0]);
304 _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, &pdata[2], &idx[1], &di[1]);
/* First load uses flow->first_load_sz as the byte count. */
306 in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0], flow->first_load_sz);
307 in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1], flow->first_load_sz);
309 _F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
310 _F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);
/* Both halves start with every lane set in their fm[] mask. */
312 fm[0] = _SIMD_MASK_MAX_;
313 fm[1] = _SIMD_MASK_MAX_;
316 _F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
/* Loop until both fm[] masks are zero. */
318 while ((fm[0] | fm[1]) != 0) {
/* Load the next sizeof(uint32_t) bytes for the lanes set in fm[]. */
322 in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0], &di[0],
sizeof(uint32_t));
323 in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1], &di[1],
sizeof(uint32_t));
/*
 * Four trans() steps per half, alternating between the halves. Each call
 * feeds its result back into in[], so the four steps presumably consume
 * the four loaded bytes one at a time - confirm against trans().
 */
327 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
328 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
330 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
331 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
333 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
334 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
336 in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
337 in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
/* Handle flagged lanes; this call receives fm and may update it. */
340 _F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
/*
 * resolve_match_idx: shift every 32-bit lane of mi left by match_log bits.
 *
 * The build-time check states the intended relation
 * sizeof(struct cne_acl_match_results) == 1 << (match_log + 2), i.e. the
 * shifted value multiplied by 4 bytes equals mi times the struct size.
 *
 * NOTE(review): match_log is defined outside this excerpt.
 */
347 static inline _T_simd
348 _F_(resolve_match_idx)(_T_simd mi)
350 CNE_BUILD_BUG_ON(
sizeof(
struct cne_acl_match_results) != 1 << (match_log + 2));
351 return _M_I_(slli_epi32)(mi, match_log);
/*
 * resolve_pri: for the lanes in msk, pick one result across nb_trie tries
 * by comparing priorities.
 *
 * match[] holds one 32-bit match index per lane for the first trie; the
 * indices for each following trie start nb_skip elements later. For every
 * trie the index is converted with resolve_match_idx() and used to gather
 * a result from res[] and a priority from pri[]. The running result is
 * kept only where its priority is strictly greater than the new one.
 *
 * NOTE(review): the declarations of i, pm and m, the loop's closing brace
 * and the return statement are not visible in this excerpt; cr is
 * presumably the returned vector.
 */
357 static inline _T_simd
358 _F_(resolve_pri)(
const int32_t res[],
const int32_t pri[],
const uint32_t match[], _T_mask msk,
359 uint32_t nb_trie, uint32_t nb_skip)
364 _T_simd cp, cr, np, nr, mch;
366 const _T_simd zero = _M_I_(set1_epi32)(0);
/* First trie: load the match indices (zero for lanes outside msk). */
369 mch = _M_I_(maskz_loadu_epi32)(msk, match);
370 mch = _F_(resolve_match_idx)(mch);
/* cr / cp = current result and priority; zero for lanes outside msk. */
373 cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
sizeof(res[0]));
374 cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
sizeof(pri[0]));
/* Remaining tries: pm advances by nb_skip elements per trie. */
380 for (i = 1, pm = match + nb_skip; i != nb_trie; i++, pm += nb_skip) {
382 mch = _M_I_(maskz_loadu_epi32)(msk, pm);
383 mch = _F_(resolve_match_idx)(mch);
/* nr / np = result and priority reported by this trie. */
385 nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
sizeof(res[0]));
386 np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
sizeof(pri[0]));
/*
 * m flags lanes where cp > np. Those lanes keep cr/cp; all other lanes
 * (including equal priorities) take nr/np.
 */
388 m = _M_I_(cmpgt_epi32_mask)(cp, np);
389 cr = _M_I_(mask_mov_epi32)(nr, m, cr);
390 cp = _M_I_(mask_mov_epi32)(np, m, cp);
/*
 * resolve_sc: resolve results for nb_pkt packets that fit in one vector.
 *
 * Builds a mask with the low nb_pkt bits set, runs resolve_pri() for those
 * lanes and stores only those lanes to result[].
 *
 * NOTE(review): the return type and the declarations of msk and cr are not
 * visible. The shift assumes nb_pkt is smaller than the width of int; the
 * visible caller passes at most _SIMD_MASK_BIT_ in its first call, and the
 * bound on the second call's n is not visible - confirm.
 */
400 _F_(resolve_sc)(uint32_t result[],
const int32_t res[],
const int32_t pri[],
const uint32_t match[],
401 uint32_t nb_pkt, uint32_t nb_trie, uint32_t nb_skip)
/* Low nb_pkt bits set: one mask bit per packet. */
406 msk = (1 << nb_pkt) - 1;
407 cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
/* Masked store: only the first nb_pkt elements of result[] are written. */
408 _M_I_(mask_storeu_epi32)(result, msk, cr);
/*
 * resolve_single_cat: resolve results for nb_pkt packets across nb_trie
 * tries.
 *
 * The main loop handles _SIMD_FLOW_NUM_ packets per iteration as two full
 * vectors, at offsets k and k + _SIMD_MASK_BIT_. The remaining n packets go
 * through resolve_sc(), split in two calls when n exceeds _SIMD_MASK_BIT_.
 * Every resolve_pri() / resolve_sc() call passes nb_pkt as the element
 * distance between consecutive tries in match[].
 *
 * NOTE(review): the return type, the declarations of j, k, n and cr, the
 * assignments of pri and n and the closing braces are not visible in this
 * excerpt; n is presumably the count left over after the main loop.
 */
415 _F_(resolve_single_cat)(uint32_t result[],
const struct cne_acl_match_results pr[],
416 const uint32_t match[], uint32_t nb_pkt, uint32_t nb_trie)
419 const int32_t *res, *pri;
/* Result values are read through the results member of the first pr entry. */
422 res = (
const int32_t *)pr->results;
/* Stop at nb_pkt rounded down using _SIMD_FLOW_MSK_. */
425 for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {
/* j indexes the second vector of this iteration. */
427 j = k + _SIMD_MASK_BIT_;
429 cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_, nb_trie, nb_pkt);
430 cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_, nb_trie, nb_pkt);
/* Unaligned full-vector stores of both results. */
432 _M_SI_(storeu)((
void *)(result + k), cr[0]);
433 _M_SI_(storeu)((
void *)(result + j), cr[1]);
/* Tail: more than one vector left, so resolve one full vector first. */
438 if (n > _SIMD_MASK_BIT_) {
439 _F_(resolve_sc)(result + k, res, pri, match + k, _SIMD_MASK_BIT_, nb_trie, nb_pkt);
440 k += _SIMD_MASK_BIT_;
441 n -= _SIMD_MASK_BIT_;
/* Resolve whatever is left (n packets) starting at offset k. */
443 _F_(resolve_sc)(result + k, res, pri, match + k, n, nb_trie, nb_pkt);
/*
 * Defined with an empty replacement list: where this definition is in
 * force, CNE_BUILD_BUG_ON(...) expands to nothing and performs no check.
 * NOTE(review): it appears after the use in resolve_match_idx, so it does
 * not apply to that use unless the macro is also defined earlier - confirm.
 */
#define CNE_BUILD_BUG_ON(condition)
/* Defined with an empty replacement list: the annotation expands to nothing. */
#define __cne_always_inline