CNDP  22.08.0
acl_run_avx512_common.h
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2020-2022 Intel Corporation
 */

/*
 * WARNING: It is not recommended to include this file directly.
 * Please include "acl_run_avx512x*.h" instead.
 * To make this file generate proper code, the includer has to
 * define several macros; refer to "acl_run_avx512x*.h" for more details.
 */

/*
 * Calculate the address of the next transition for
 * all types of nodes. Note that only DFA nodes and range
 * nodes actually transition to another node. Match
 * nodes are not supposed to be encountered here.
 * For quad range nodes:
 * Calculate the number of range boundaries that are less than the
 * input value. Range boundaries for each node are signed 8-bit values,
 * ordered from -128 to 127.
 * This is effectively a popcount of the boundary bytes that the
 * input byte is greater than.
 * Single nodes are processed in the same way as quad range nodes.
 */
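/*
 * Rough per-lane scalar equivalent of calc_addr() below, for exposition only
 * (not part of the build; names such as in_byte/base/ofs are illustrative):
 *
 *   base = tr_lo & index_mask;                      // node index
 *   if (node_type == 0)                             // DFA node
 *       ofs = in_byte - tr_hi.byte[in_byte >> 6];   // offset within quadrant
 *   else                                            // QUAD/SINGLE node
 *       ofs = count of boundary bytes in tr_hi that are < in_byte;
 *   addr = base + ofs;                              // next transition index
 */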
static __cne_always_inline _T_simd
_F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input, _T_simd four_32,
               _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
{
    __mmask64 qm;
    _T_mask dfa_msk;
    _T_simd addr, in, node_type, r, t;
    _T_simd dfa_ofs, quad_ofs;

    t = _M_SI_(xor)(index_mask, index_mask);
    in = _M_I_(shuffle_epi8)(next_input, shuffle_input);

    /* Calc node type and node addr */
    node_type = _M_SI_(andnot)(index_mask, tr_lo);
    addr = _M_SI_(and)(index_mask, tr_lo);

    /* mask for DFA type(0) nodes */
    dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);

    /* DFA calculations. */
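    /*
     * The current input byte is broadcast across each 32-bit lane of "in"
     * (via shuffle_input, defined by the including header). Its top two bits
     * select one of the four per-quadrant base bytes held in tr_hi:
     * range_base turns that 0..3 value into a lane-local byte index for
     * shuffle_epi8, and the DFA offset is the input byte minus that base.
     */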
    r = _M_I_(srli_epi32)(in, 30);
    r = _M_I_(add_epi8)(r, range_base);
    t = _M_I_(srli_epi32)(in, 24);
    r = _M_I_(shuffle_epi8)(tr_hi, r);

    dfa_ofs = _M_I_(sub_epi32)(t, r);

    /* QUAD/SINGLE calculations. */
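    /*
     * The four range boundaries sit in the bytes of tr_hi, sorted in
     * ascending order. Bytes where the input is greater become 0xff and
     * cluster at the low end of the lane, so lzcnt/8 counts the untouched
     * high bytes and (4 - that) is the number of boundaries below the
     * input, i.e. the transition offset.
     */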
    qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
    t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
    t = _M_I_(lzcnt_epi32)(t);
    t = _M_I_(srli_epi32)(t, 3);
    quad_ofs = _M_I_(sub_epi32)(four_32, t);

    /* blend DFA and QUAD/SINGLE. */
    t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);

    /* calculate address for next transitions. */
    addr = _M_I_(add_epi32)(addr, t);
    return addr;
}

/*
 * Process _N_ transitions in parallel.
 * tr_lo contains low 32 bits for _N_ transitions.
 * tr_hi contains high 32 bits for _N_ transitions.
 * next_input contains up to 4 input bytes for _N_ flows.
 */
static __cne_always_inline _T_simd
_F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo, _T_simd *tr_hi)
{
    const int32_t *tr;
    _T_simd addr;

    tr = (const int32_t *)(uintptr_t)trans;

    /* Calculate the address (array index) for all _N_ transitions. */
    addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input), _SV_(four_32),
                          _SV_(range_base), *tr_lo, *tr_hi);

    /* load lower 32 bits of _N_ transitions at once. */
    *tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));

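    /* shift out the byte just consumed so the next trans() call
     * uses the following input byte
     */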
    next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);

    /* load high 32 bits of _N_ transitions at once. */
    *tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));

    return next_input;
}

/*
 * Execute first transition for up to _N_ flows in parallel.
 * next_input should contain one input byte for up to _N_ flows.
 * msk - mask of active flows.
 * tr_lo contains low 32 bits for up to _N_ transitions.
 * tr_hi contains high 32 bits for up to _N_ transitions.
 */
static __cne_always_inline void
_F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input, _T_mask msk,
                 _T_simd *tr_lo, _T_simd *tr_hi)
{
    const int32_t *tr;
    _T_simd addr, root;

    tr = (const int32_t *)(uintptr_t)flow->trans;

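    /*
     * The first transition is always taken from the root node: its table
     * index is the flow's root_index plus the first input byte.
     */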
    addr = _M_I_(set1_epi32)(UINT8_MAX);
    root = _M_I_(set1_epi32)(flow->root_index);

    addr = _M_SI_(and)(next_input, addr);
    addr = _M_I_(add_epi32)(root, addr);

    /* load lower 32 bits of _N_ transitions at once. */
    *tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr, sizeof(flow->trans[0]));

    /* load high 32 bits of _N_ transitions at once. */
    *tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1), sizeof(flow->trans[0]));
}

/*
 * Load and return next 4 input bytes for up to _N_ flows in parallel.
 * pdata - 2 x _SIMD_PTR_NUM_ pointers to flow input data
 * msk - mask of active flows.
 * di - data indexes for these _N_ flows.
 */
static inline _T_simd
_F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2], uint32_t msk, _T_simd *di,
                    uint32_t bnum)
{
    const int32_t *div;
    uint32_t m[2];
    _T_simd one, zero, t, p[2];

    div = (const int32_t *)flow->data_index;

    one = _M_I_(set1_epi32)(1);
    zero = _M_SI_(xor)(one, one);

    /* load data offsets for given indexes */
    t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));

    /* increment data indexes */
    *di = _M_I_(mask_add_epi32)(*di, msk, *di, one);

    /*
     * unsigned expand 32-bit indexes to 64-bit
     * (for later pointer arithmetic), i.e:
     * for (i = 0; i != _N_; i++)
     *     p[i/8].u64[i%8] = (uint64_t)t.u32[i];
     */
    p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]), t);
    p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]), t);

    p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
    p[1] = _M_I_(add_epi64)(p[1], pdata[1]);

    /* load input byte(s), either one or four */

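    /* split mask into two - one for each pdata[] */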
    m[0] = msk & _SIMD_PTR_MSK_;
    m[1] = msk >> _SIMD_PTR_NUM_;

    return _F_(gather_bytes)(zero, p, m, bnum);
}

/*
 * Start up to _N_ new flows.
 * num - number of flows to start
 * msk - mask of new flows.
 * pdata - pointers to flow input data
 * idx - match indexes for given flows
 * di - data indexes for these flows.
 */
static inline void
_F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk, _T_simd pdata[2],
                _T_simd *idx, _T_simd *di)
{
    uint32_t n, m[2], nm[2];
    _T_simd ni, nd[2];

    /* split mask into two - one for each pdata[] */
    m[0] = msk & _SIMD_PTR_MSK_;
    m[1] = msk >> _SIMD_PTR_NUM_;

    /* calculate masks for new flows */
    n = __builtin_popcount(m[0]);
    nm[0] = (1 << n) - 1;
    nm[1] = (1 << (num - n)) - 1;

    /* load input data pointers for new flows */
    nd[0] = _M_I_(maskz_loadu_epi64)(nm[0], flow->idata + flow->num_packets);
    nd[1] = _M_I_(maskz_loadu_epi64)(nm[1], flow->idata + flow->num_packets + n);

    /* calculate match indexes of new flows */
    ni = _M_I_(set1_epi32)(flow->num_packets);
    ni = _M_I_(add_epi32)(ni, _SV_(idx_add));

    /* merge new and existing flows data */
    pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
    pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);

    /* update match and data indexes */
    *idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
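    /* reset data indexes of the newly started flows to zero */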
    *di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);

    flow->num_packets += num;
}

/*
 * Process found matches for up to _N_ flows.
 * fmsk - mask of active flows
 * rmsk - mask of found matches
 * pdata - pointers to flow input data
 * di - data indexes for these flows
 * idx - match indexes for given flows
 * tr_lo contains low 32 bits for up to _N_ transitions.
 * tr_hi contains high 32 bits for up to _N_ transitions.
 */
static inline uint32_t
_F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk, uint32_t *rmsk, _T_simd pdata[2],
                   _T_simd *di, _T_simd *idx, _T_simd *tr_lo, _T_simd *tr_hi)
{
    uint32_t n;
    _T_simd res;

    if (rmsk[0] == 0)
        return 0;

    /* extract match indexes */
    res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));

    /* mask matched transitions to nop */
    tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
    tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));

    /* save found match indexes */
    _M_I_(mask_i32scatter_epi32)
    ((void *)flow->matches, rmsk[0], idx[0], res, sizeof(flow->matches[0]));

    /* update masks and start new flows for matches */
    n = update_flow_mask(flow, fmsk, rmsk);
    _F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);

    return n;
}

/*
 * Test for matches in up to (2 * _N_) flows at once;
 * if matches exist - process them and start new flows.
 */
static inline void
_F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2], _T_simd pdata[4],
                         _T_simd di[2], _T_simd idx[2], _T_simd inp[2], _T_simd tr_lo[2],
                         _T_simd tr_hi[2])
{
    uint32_t n[2];
    uint32_t rm[2];

    /* check for matches */
    rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
    rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));

    /* while unprocessed matches exist */
    while ((rm[0] | rm[1]) != 0) {

        /* process matches and start new flows */
        n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0], &di[0], &idx[0], &tr_lo[0],
                                  &tr_hi[0]);
        n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2], &di[1], &idx[1], &tr_lo[1],
                                  &tr_hi[1]);

        /* execute first transition for new flows, if any */
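        /* (a freshly started flow can hit a match node on its very first
         * transition, hence rm[] is re-evaluated and the loop repeats)
         */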

        if (n[0] != 0) {
            inp[0] = _F_(get_next_bytes)(flow, &pdata[0], rm[0], &di[0], flow->first_load_sz);
            _F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0], &tr_hi[0]);
            rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
        }

        if (n[1] != 0) {
            inp[1] = _F_(get_next_bytes)(flow, &pdata[2], rm[1], &di[1], flow->first_load_sz);
            _F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1], &tr_hi[1]);
            rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
        }
    }
}

/*
 * Perform search for up to (2 * _N_) flows in parallel.
 * Use two sets of metadata, each serves _N_ flows max.
 */
static inline void
_F_(search_trie)(struct acl_flow_avx512 *flow)
{
    uint32_t fm[2];
    _T_simd di[2] = {0}, idx[2] = {0}, in[2], pdata[4] = {0}, tr_lo[2] = {0}, tr_hi[2] = {0};

    /* first 1B load */
    _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, &pdata[0], &idx[0], &di[0]);
    _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_, &pdata[2], &idx[1], &di[1]);

    in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0], flow->first_load_sz);
    in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1], flow->first_load_sz);

    _F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
    _F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);

    fm[0] = _SIMD_MASK_MAX_;
    fm[1] = _SIMD_MASK_MAX_;

    /* match check */
    _F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);

    while ((fm[0] | fm[1]) != 0) {

        /* load next 4B */

        in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0], &di[0], sizeof(uint32_t));
        in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1], &di[1], sizeof(uint32_t));

        /* main 4B loop */
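        /* each trans() call consumes one byte of the 4-byte word loaded above,
         * so it is invoked four times per loop iteration
         */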

        in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
        in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

        in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
        in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

        in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
        in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

        in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
        in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

        /* check for matches */
        _F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
    }
}

/*
 * Resolve match index to actual result/priority offset.
 */
static inline _T_simd
_F_(resolve_match_idx)(_T_simd mi)
{
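    /*
     * The shift converts a match index into an offset in 32-bit words:
     * the build-time check below guarantees that one cne_acl_match_results
     * entry occupies exactly (1 << match_log) 32-bit words, matching the
     * sizeof(int32_t) gather scale used in resolve_pri().
     */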
    CNE_BUILD_BUG_ON(sizeof(struct cne_acl_match_results) != 1 << (match_log + 2));
    return _M_I_(slli_epi32)(mi, match_log);
}

/*
 * Resolve multiple matches for the same flow based on priority.
 */
static inline _T_simd
_F_(resolve_pri)(const int32_t res[], const int32_t pri[], const uint32_t match[], _T_mask msk,
                 uint32_t nb_trie, uint32_t nb_skip)
{
    uint32_t i;
    const uint32_t *pm;
    _T_mask m;
    _T_simd cp, cr, np, nr, mch;

    const _T_simd zero = _M_I_(set1_epi32)(0);

    /* get match indexes */
    mch = _M_I_(maskz_loadu_epi32)(msk, match);
    mch = _F_(resolve_match_idx)(mch);

    /* read result and priority values for first trie */
    cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
    cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));

    /*
     * read result and priority values for next tries and select one
     * with highest priority.
     */
    for (i = 1, pm = match + nb_skip; i != nb_trie; i++, pm += nb_skip) {

        mch = _M_I_(maskz_loadu_epi32)(msk, pm);
        mch = _F_(resolve_match_idx)(mch);

        nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
        np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));

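        /* keep the current result where its priority is higher,
         * otherwise take the one from this trie
         */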
        m = _M_I_(cmpgt_epi32_mask)(cp, np);
        cr = _M_I_(mask_mov_epi32)(nr, m, cr);
        cp = _M_I_(mask_mov_epi32)(np, m, cp);
    }

    return cr;
}

/*
 * Resolve num (<= _N_) matches for single category
 */
static inline void
_F_(resolve_sc)(uint32_t result[], const int32_t res[], const int32_t pri[], const uint32_t match[],
                uint32_t nb_pkt, uint32_t nb_trie, uint32_t nb_skip)
{
    _T_mask msk;
    _T_simd cr;

    msk = (1 << nb_pkt) - 1;
    cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
    _M_I_(mask_storeu_epi32)(result, msk, cr);
}

/*
 * Resolve matches for single category
 */
static inline void
_F_(resolve_single_cat)(uint32_t result[], const struct cne_acl_match_results pr[],
                        const uint32_t match[], uint32_t nb_pkt, uint32_t nb_trie)
{
    uint32_t j, k, n;
    const int32_t *res, *pri;
    _T_simd cr[2];

    res = (const int32_t *)pr->results;
    pri = pr->priority;

    for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {

        j = k + _SIMD_MASK_BIT_;

        cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_, nb_trie, nb_pkt);
        cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_, nb_trie, nb_pkt);

        _M_SI_(storeu)((void *)(result + k), cr[0]);
        _M_SI_(storeu)((void *)(result + j), cr[1]);
    }

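    /* handle the tail: fewer than _SIMD_FLOW_NUM_ packets remain */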
    n = nb_pkt - k;
    if (n != 0) {
        if (n > _SIMD_MASK_BIT_) {
            _F_(resolve_sc)(result + k, res, pri, match + k, _SIMD_MASK_BIT_, nb_trie, nb_pkt);
            k += _SIMD_MASK_BIT_;
            n -= _SIMD_MASK_BIT_;
        }
        _F_(resolve_sc)(result + k, res, pri, match + k, n, nb_trie, nb_pkt);
    }
}