CNDP  22.08.0
acl_run_avx512x16.h
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2020-2022 Intel Corporation
 */

/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */

/*
 * This implementation uses 512-bit registers (zmm) and intrinsics.
 * So our main SIMD type is 512 bits wide, and each such variable can
 * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
 */
#define _T_simd __m512i
#define _T_mask __mmask16

/* Naming convention for static const variables. */
#define _SC_(x) zmm_##x
#define _SV_(x) (zmm_##x.z)

/* Naming convention for internal functions. */
#define _F_(x) x##_avx512x16
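/*
 * For example, _SC_(match_mask) names the constant zmm_match_mask,
 * _SV_(match_mask) refers to its SIMD member zmm_match_mask.z, and
 * _F_(search_trie) expands to search_trie_avx512x16.
 */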

/*
 * The same intrinsics have a different syntax (depending on the bit-width),
 * so to overcome that a few macros need to be defined.
 */

/* Naming convention for generic epi (packed integers) type intrinsics. */
#define _M_I_(x) _mm512_##x

/* Naming convention for si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x) _mm512_##x##_si512

/* Naming convention for masked gather type intrinsics. */
#define _M_MGI_(x) _mm512_##x

/* Naming convention for gather type intrinsics. */
#define _M_GI_(name, idx, base, scale) _mm512_##name(idx, base, scale)
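/*
 * For example, _M_I_(add_epi32) expands to _mm512_add_epi32,
 * _M_SI_(and) to _mm512_and_si512, and
 * _M_GI_(i32gather_epi32, idx, base, scale) to
 * _mm512_i32gather_epi32(idx, base, scale).
 */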

/* num/mask of transitions per SIMD regs */
#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_ CNE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)

/* num/mask of pointers per SIMD regs */
#define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_ CNE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
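/*
 * With 64-byte zmm registers this gives _SIMD_MASK_BIT_ == 16,
 * _SIMD_MASK_MAX_ == 0xffff and _SIMD_FLOW_NUM_ == 32; assuming 64-bit
 * pointers, _SIMD_PTR_NUM_ == 8 and _SIMD_PTR_MSK_ == 0xff.
 */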

static const __cne_x86_zmm_t _SC_(match_mask) = {
    .u32 =
        {
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
        },
};

static const __cne_x86_zmm_t _SC_(index_mask) = {
    .u32 =
        {
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
        },
};
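/*
 * The match/index masks above are replicated into every 32-bit lane so a
 * single SIMD operation can apply them to 16 transitions at once.
 */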

static const __cne_x86_zmm_t _SC_(trlo_idle) = {
    .u32 =
        {
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
        },
};

// clang-format off
static const __cne_x86_zmm_t _SC_(trhi_idle) = {
    .u32 = {
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
    },
};

static const __cne_x86_zmm_t _SC_(shuffle_input) = {
    .u32 = {
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
    },
};

static const __cne_x86_zmm_t _SC_(four_32) = {
    .u32 = {
        4, 4, 4, 4,
        4, 4, 4, 4,
        4, 4, 4, 4,
        4, 4, 4, 4,
    },
};

static const __cne_x86_zmm_t _SC_(idx_add) = {
    .u32 = {
        0, 1, 2, 3,
        4, 5, 6, 7,
        8, 9, 10, 11,
        12, 13, 14, 15,
    },
};

static const __cne_x86_zmm_t _SC_(range_base) = {
    .u32 = {
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
    },
};

static const __cne_x86_zmm_t _SC_(pminp) = {
    .u32 = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    },
};
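
/*
 * pminp holds indices for _mm512_permutex2var_epi32(): values 0x00-0x07
 * select the low eight dwords of the first operand and 0x10-0x17 the low
 * eight dwords of the second, which is how gather_bytes() below squeezes
 * two 256-bit gather results into one 512-bit register.
 */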

static const _T_mask _SC_(pmidx_msk) = 0x5555;

static const __cne_x86_zmm_t _SC_(pmidx[2]) = {
    [0] = {
        .u32 = {
            0, 0, 1, 0, 2, 0, 3, 0,
            4, 0, 5, 0, 6, 0, 7, 0,
        },
    },
    [1] = {
        .u32 = {
            8, 0, 9, 0, 10, 0, 11, 0,
            12, 0, 13, 0, 14, 0, 15, 0,
        },
    },
};
// clang-format on

/*
 * Unfortunately, the current AVX512 ISA doesn't provide the ability to
 * gather-load byte quantities. So we have to mimic it in SW,
 * by doing 8x1B scalar loads.
 */
static inline __m256i
_m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
{
    cne_ymm_t v;
    __cne_x86_zmm_t p;

    static const uint32_t zero;

    p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_, (uintptr_t)&zero);
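
    /*
     * Lanes not selected by 'mask' now point at 'zero', so the eight
     * scalar loads below always read from valid memory.
     */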
    v.u32[0] = *(uint8_t *)p.u64[0];
    v.u32[1] = *(uint8_t *)p.u64[1];
    v.u32[2] = *(uint8_t *)p.u64[2];
    v.u32[3] = *(uint8_t *)p.u64[3];
    v.u32[4] = *(uint8_t *)p.u64[4];
    v.u32[5] = *(uint8_t *)p.u64[5];
    v.u32[6] = *(uint8_t *)p.u64[6];
    v.u32[7] = *(uint8_t *)p.u64[7];

    return v.y;
}

/*
 * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
 */
static __cne_always_inline __m512i
_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2], uint32_t bnum)
{
    __m256i inp[2];

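    /*
     * bnum selects how much to fetch per location: a single byte
     * (emulated above with scalar loads) or a 32-bit load via the
     * hardware gather.
     */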
    if (bnum == sizeof(uint8_t)) {
        inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
        inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
    } else {
        inp[0] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero), m[0], p[0], NULL,
                                             sizeof(uint8_t));
        inp[1] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero), m[1], p[1], NULL,
                                             sizeof(uint8_t));
    }

    /* squeeze input into one 512-bit register */
    return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]), _SV_(pminp),
                                     _mm512_castsi256_si512(inp[1]));
}

/*
 * Resolve matches for multiple categories (GT 8, use 512b instructions/regs)
 */
static inline void
resolve_mcgt8_avx512x1(uint32_t result[], const struct cne_acl_match_results pr[],
                       const uint32_t match[], uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
{
    const int32_t *pri;
    const uint32_t *pm, *res;
    uint32_t i, k, mi;
    __mmask16 cm, sm;
    __m512i cp, cr, np, nr;

    const uint32_t match_log = 5;
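    /*
     * Each cne_acl_match_results entry spans 2^match_log == 32 uint32_t
     * words (results and priorities per category), so shifting a match
     * index left by match_log yields the word offset of its entry.
     */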

    res = pr->results;
    pri = pr->priority;

    cm = (1 << nb_cat) - 1;

    for (k = 0; k != nb_pkt; k++, result += nb_cat) {

        mi = match[k] << match_log;

        cr = _mm512_maskz_loadu_epi32(cm, res + mi);
        cp = _mm512_maskz_loadu_epi32(cm, pri + mi);

        for (i = 1, pm = match + nb_pkt; i != nb_trie; i++, pm += nb_pkt) {

            mi = pm[k] << match_log;

            nr = _mm512_maskz_loadu_epi32(cm, res + mi);
            np = _mm512_maskz_loadu_epi32(cm, pri + mi);

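            /*
             * Keep the current (result, priority) pair only in lanes where
             * it still has the higher priority; otherwise take the values
             * just loaded for this trie.
             */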
            sm = _mm512_cmpgt_epi32_mask(cp, np);
            cr = _mm512_mask_mov_epi32(nr, sm, cr);
            cp = _mm512_mask_mov_epi32(np, sm, cp);
        }

        _mm512_mask_storeu_epi32(result, cm, cr);
    }
}

#include "acl_run_avx512_common.h"

/*
 * Perform search for up to (2 * 16) flows in parallel.
 * Use two sets of metadata, each serving up to 16 flows.
 */
static inline int
search_avx512x16x2(const struct cne_acl_ctx *ctx, const uint8_t **data, uint32_t *results,
                   uint32_t total_packets, uint32_t categories)
{
    uint32_t i, *pm;
    const struct cne_acl_match_results *pr;
    struct acl_flow_avx512 flow;
    uint32_t match[ctx->num_tries * total_packets];
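
    /*
     * 'match' collects one match index per (trie, packet) pair: trie i
     * writes its indexes into the slice starting at match[i * total_packets].
     */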
    for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

        /* setup for next trie */
        acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

        /* process the trie */
        _F_(search_trie)(&flow);
    }

    /* resolve matches */
    pr = (const struct cne_acl_match_results *)(ctx->trans_table + ctx->match_index);

    if (categories == 1)
        _F_(resolve_single_cat)(results, pr, match, total_packets, ctx->num_tries);
    else if (categories <= CNE_ACL_MAX_CATEGORIES / 2)
        resolve_mcle8_avx512x1(results, pr, match, total_packets, categories, ctx->num_tries);
    else
        resolve_mcgt8_avx512x1(results, pr, match, total_packets, categories, ctx->num_tries);

    return 0;
}

#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd