CNDP  22.08.0
acl_run_avx512x8.h
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2020-2022 Intel Corporation
 */

/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */

/*
 * This implementation uses 256-bit registers (ymm) and intrinsics.
 * So our main SIMD type is 256 bits wide and each such variable can
 * process sizeof(__m256i) / sizeof(uint32_t) == 8 entries in parallel.
 */
#define _T_simd __m256i
#define _T_mask __mmask8

/* Naming convention for static const variables. */
#define _SC_(x) ymm_##x
#define _SV_(x) (ymm_##x.y)
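
/*
 * For reference: with the conventions above, _SC_(match_mask) expands to
 * the variable name ymm_match_mask, while _SV_(match_mask) expands to
 * (ymm_match_mask.y), i.e. the __m256i view of that cne_ymm_t value.
 */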

/* Naming convention for internal functions. */
#define _F_(x) x##_avx512x8

/*
 * The same intrinsics have different syntax (depending on the bit-width),
 * so to overcome that a few macros need to be defined.
 */

/* Naming convention for generic epi (packed integers) type intrinsics. */
#define _M_I_(x) _mm256_##x

/* Naming convention for si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x) _mm256_##x##_si256

/* Naming convention for masked gather type intrinsics. */
#define _M_MGI_(x) _mm256_m##x

/* Naming convention for gather type intrinsics. */
#define _M_GI_(name, idx, base, scale) _mm256_##name(base, idx, scale)
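
/*
 * For reference, some example expansions of the wrappers above:
 *   _F_(gather_bytes)             -> gather_bytes_avx512x8
 *   _M_I_(set1_epi32)(x)          -> _mm256_set1_epi32(x)
 *   _M_SI_(and)(a, b)             -> _mm256_and_si256(a, b)
 *   _M_MGI_(mask_i64gather_epi32) -> _mm256_mmask_i64gather_epi32
 *   _M_GI_(i64gather_epi32, idx, base, scale)
 *                                 -> _mm256_i64gather_epi32(base, idx, scale)
 * Note that _M_GI_ also reorders its arguments to match the intrinsic
 * signature.
 */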

/* num/mask of transitions per SIMD regs */
#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_ CNE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)

/* num/mask of pointers per SIMD regs */
#define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_ CNE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
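
/*
 * For reference: with 256-bit registers these evaluate to
 * _SIMD_MASK_BIT_ == 8, _SIMD_FLOW_NUM_ == 16, _SIMD_FLOW_MSK_ == 15 and,
 * on a 64-bit target, _SIMD_PTR_NUM_ == 4. Assuming CNE_LEN2MASK(n, t)
 * builds an n-bit all-ones mask of type t, _SIMD_MASK_MAX_ == 0xff and
 * _SIMD_PTR_MSK_ == 0xf.
 */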

static const cne_ymm_t _SC_(match_mask) = {
    .u32 =
        {
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
        },
};

static const cne_ymm_t _SC_(index_mask) = {
    .u32 =
        {
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
        },
};

static const cne_ymm_t _SC_(trlo_idle) = {
    .u32 =
        {
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
        },
};

// clang-format off
static const cne_ymm_t _SC_(trhi_idle) = {
    .u32 = {
        0, 0, 0, 0,
        0, 0, 0, 0,
    },
};

static const cne_ymm_t _SC_(shuffle_input) = {
    .u32 = {
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
    },
};

static const cne_ymm_t _SC_(four_32) = {
    .u32 = {
        4, 4, 4, 4,
        4, 4, 4, 4,
    },
};

static const cne_ymm_t _SC_(idx_add) = {
    .u32 = {
        0, 1, 2, 3,
        4, 5, 6, 7,
    },
};

static const cne_ymm_t _SC_(range_base) = {
    .u32 = {
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
    },
};
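
/*
 * The tables above are consumed by the generic trie-walk code in
 * "acl_run_avx512_common.h". Judging by their contents and names:
 * shuffle_input is a byte-shuffle control replicating input bytes
 * 0/4/8/12 across the corresponding 32-bit lanes, four_32 holds the
 * per-iteration four-byte input advance, idx_add simply enumerates the
 * 8 flow lanes 0..7, and range_base provides the per-lane base constants
 * for the range/DFA address calculation.
 */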

static const cne_ymm_t _SC_(pminp) = {
    .u32 = {
        0x00, 0x01, 0x02, 0x03,
        0x08, 0x09, 0x0a, 0x0b,
    },
};
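
/*
 * pminp is the index vector for _mm256_permutex2var_epi32 in
 * gather_bytes() below: indexes 0x00-0x03 select dwords 0-3 of the first
 * source and 0x08-0x0b (bit 3 set) select dwords 0-3 of the second,
 * merging two 128-bit gather results into one 256-bit register.
 */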

static const __mmask16 _SC_(pmidx_msk) = 0x55;

static const cne_ymm_t _SC_(pmidx[2]) = {
    [0] = {
        .u32 = {
            0, 0, 1, 0, 2, 0, 3, 0,
        },
    },
    [1] = {
        .u32 = {
            4, 0, 5, 0, 6, 0, 7, 0,
        },
    },
};
// clang-format on
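
/*
 * pmidx[] spreads 8 32-bit indexes across the even dword positions of two
 * registers (each index then occupies the low half of a 64-bit lane), and
 * pmidx_msk == 0x55 selects exactly those even positions; presumably this
 * prepares 64-bit indices for the i64gather-style loads in the common
 * code.
 */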

/*
 * Unfortunately, the current AVX512 ISA doesn't provide the ability to
 * gather-load byte quantities. So we have to mimic it in SW by doing
 * 4 x 1B scalar loads.
 */
static inline __m128i
_m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
{
    cne_xmm_t v;
    cne_ymm_t p;

    static const uint32_t zero;

    /* Replace the pointers of inactive lanes with &zero, so that all four
     * scalar loads below are always safe to perform.
     */
    p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_, (uintptr_t)&zero);

    /* 4 x 1B scalar loads, each zero-extended into a 32-bit lane. */
    v.u32[0] = *(uint8_t *)p.u64[0];
    v.u32[1] = *(uint8_t *)p.u64[1];
    v.u32[2] = *(uint8_t *)p.u64[2];
    v.u32[3] = *(uint8_t *)p.u64[3];

    return v.x;
}

/*
 * Gather 4/1 input bytes for up to 8 (2 * 4) locations in parallel.
 */
static __cne_always_inline __m256i
_F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2], uint32_t bnum)
{
    __m128i inp[2];

    if (bnum == sizeof(uint8_t)) {
        /* no byte-sized gather in the ISA, mimic it with scalar loads */
        inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);
        inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);
    } else {
        /* 32-bit masked gather; p[] holds absolute addresses, so the base
         * pointer is NULL and the scale is a single byte.
         */
        inp[0] = _mm256_mmask_i64gather_epi32(_mm256_castsi256_si128(zero), m[0], p[0], NULL,
                                              sizeof(uint8_t));
        inp[1] = _mm256_mmask_i64gather_epi32(_mm256_castsi256_si128(zero), m[1], p[1], NULL,
                                              sizeof(uint8_t));
    }

    /* squeeze input into one 256-bit register */
    return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]), _SV_(pminp),
                                     _mm256_castsi128_si256(inp[1]));
}

#include "acl_run_avx512_common.h"
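
/*
 * Including "acl_run_avx512_common.h" at this point instantiates the
 * generic trie-walk helpers with the 256-bit definitions above; in
 * particular it provides _F_(search_trie), i.e. search_trie_avx512x8,
 * which is used by the search routine below.
 */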

/*
 * Perform search for up to (2 * 8) flows in parallel.
 * Use two sets of metadata, each of which serves up to 8 flows.
 */
static inline int
search_avx512x8x2(const struct cne_acl_ctx *ctx, const uint8_t **data, uint32_t *results,
                  uint32_t total_packets, uint32_t categories)
{
    uint32_t i, *pm;
    const struct cne_acl_match_results *pr;
    struct acl_flow_avx512 flow;
    /* per-trie match indexes for every packet */
    uint32_t match[ctx->num_tries * total_packets];

    for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

        /* setup for next trie */
        acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

        /* process the trie */
        _F_(search_trie)(&flow);
    }

    /* resolve matches */
    pr = (const struct cne_acl_match_results *)(ctx->trans_table + ctx->match_index);

    if (categories == 1)
        _F_(resolve_single_cat)(results, pr, match, total_packets, ctx->num_tries);
    else
        resolve_mcle8_avx512x1(results, pr, match, total_packets, categories, ctx->num_tries);

    return 0;
}
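
/*
 * Usage sketch (hypothetical caller, not part of this file): a classify
 * entry point would split its burst into chunks of up to 16 packets and
 * invoke this helper once per chunk, roughly:
 *
 *     uint32_t done = 0;
 *
 *     while (num - done >= 16) {
 *         search_avx512x8x2(ctx, data + done, results + done * categories,
 *             16, categories);
 *         done += 16;
 *     }
 *
 * The real dispatch logic lives outside this header.
 */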

#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd