CNDP  22.08.0
acl_run_sse.h
Go to the documentation of this file.
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2010-2022 Intel Corporation
3  */
4 
10 #include <emmintrin.h> // for _mm_loadu_si128, _mm_cvtsi128_si32, _mm_and_...
11 #include <limits.h> // for CHAR_BIT
12 #include <smmintrin.h> // for _mm_blendv_epi8, _mm_insert_epi32, _mm_testz...
13 #include <stdint.h> // for uint64_t, uint32_t, int32_t, uint8_t
14 #include <tmmintrin.h> // for _mm_shuffle_epi8, _mm_maddubs_epi16, _mm_sig...
15 #include <xmmintrin.h> // for __m128, _mm_shuffle_ps
16 
17 #include "acl_run.h" // for acl_flow_data, GET_NEXT_4BYTES, completion
18 #include "acl_vect.h" // for ACL_TR_CALC_ADDR, ACL_TR_HILO
19 #include "acl.h" // for CNE_ACL_NODE_MATCH, cne_acl_match_results
20 #include "cne_acl.h" // for CNE_ACL_RESULTS_MULTIPLIER
21 #include "cne_common.h" // for CNE_DIM, __cne_always_inline
22 #include "cne_vect.h" // for xmm_t, cne_xmm_t
23 
/*
 * Immediates for _mm_shuffle_epi32. The SLOTn values move 32-bit slot n
 * of the source into slot 0 of the result while slots 1..3 keep the
 * source's slots 1..3 (0xe4 is the identity shuffle; 0xe5/0xe6/0xe7 only
 * change the selector for slot 0). SWAP64 exchanges the two 64-bit halves.
 */
enum {
    SHUFFLE32_SLOT1  = 0xe5, /* {x0,x1,x2,x3} -> {x1,x1,x2,x3} */
    SHUFFLE32_SLOT2  = 0xe6, /* {x0,x1,x2,x3} -> {x2,x1,x2,x3} */
    SHUFFLE32_SLOT3  = 0xe7, /* {x0,x1,x2,x3} -> {x3,x1,x2,x3} */
    SHUFFLE32_SWAP64 = 0x4e, /* {x0,x1,x2,x3} -> {x2,x3,x0,x1} */
};

/*
 * _mm_shuffle_epi8 control: replicates input byte 0/4/8/12 across each
 * 32-bit lane, i.e. broadcasts one input byte per flow into its lane.
 * Used by ACL_TR_CALC_ADDR (see transition4) to pick the current input
 * byte for all four parallel traversals at once.
 */
static const cne_xmm_t xmm_shuffle_input = {
    .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
};

/* All-ones 16-bit lanes; multiplicand for _mm_maddubs_epi16-style sums
 * inside ACL_TR_CALC_ADDR. */
static const cne_xmm_t xmm_ones_16 = {
    .u16 = {1, 1, 1, 1, 1, 1, 1, 1},
};

/* CNE_ACL_NODE_MATCH replicated per 32-bit lane: ANDed against the low
 * 32 bits of each transition to detect match nodes (acl_match_check_x4). */
static const cne_xmm_t xmm_match_mask = {
    .u32 =
        {
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
        },
};

/* CNE_ACL_NODE_INDEX replicated per lane: masks the node-index bits out
 * of a transition word when computing the next-node address. */
static const cne_xmm_t xmm_index_mask = {
    .u32 =
        {
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
        },
};

/* Per-lane base values consumed by ACL_TR_CALC_ADDR when forming the
 * range-lookup index; low byte selects lane offset 0/4/8/12. */
static const cne_xmm_t xmm_range_base = {
    .u32 =
        {
            0xffffff00,
            0xffffff04,
            0xffffff08,
            0xffffff0c,
        },
};
68 
/*
 * Resolve priority for multiple results (sse version).
 * This consists comparing the priority of the current traversal with the
 * running set of results for the packet.
 * For each result, keep a running array of the result (rule number) and
 * its priority for each category.
 *
 * transition  - index of the match node in p[] for the trie just completed
 * n           - flow slot whose completion record (parms[n].cmplt) is updated
 * ctx         - ACL context; only num_tries is read, to detect the first
 *               completed trie
 * parms       - per-slot traversal state
 * p           - per-node match results table
 * categories  - number of result categories; processed 4 at a time
 *               (CNE_ACL_RESULTS_MULTIPLIER)
 */
static inline void
resolve_priority_sse(uint64_t transition, int n, const struct cne_acl_ctx *ctx, struct parms *parms,
                     const struct cne_acl_match_results *p, uint32_t categories)
{
    uint32_t x;
    xmm_t results, priority, results1, priority1, selector;
    xmm_t *saved_results, *saved_priority;

    /* Four 32-bit categories per XMM register per iteration. */
    for (x = 0; x < categories; x += CNE_ACL_RESULTS_MULTIPLIER) {

        saved_results  = (xmm_t *)(&parms[n].cmplt->results[x]);
        saved_priority = (xmm_t *)(&parms[n].cmplt->priority[x]);

        /* get results and priorities for completed trie */
        results  = _mm_loadu_si128((const xmm_t *)&p[transition].results[x]);
        priority = _mm_loadu_si128((const xmm_t *)&p[transition].priority[x]);

        /* if this is not the first completed trie */
        if (parms[n].cmplt->count != ctx->num_tries) {

            /* get running best results and their priorities */
            results1  = _mm_loadu_si128(saved_results);
            priority1 = _mm_loadu_si128(saved_priority);

            /* select results that are highest priority: per-lane, keep the
             * saved result wherever its priority is strictly greater */
            selector = _mm_cmpgt_epi32(priority1, priority);
            results  = _mm_blendv_epi8(results, results1, selector);
            priority = _mm_blendv_epi8(priority, priority1, selector);
        }

        /* save running best results and their priorities */
        _mm_storeu_si128(saved_results, results);
        _mm_storeu_si128(saved_priority, priority);
    }
}
111 
/*
 * Extract transitions from an XMM register and check for any matches.
 *
 * *indices holds two 64-bit transitions for flow slots `slot` and
 * `slot + 1`. Each is run through acl_match_check (which restarts the
 * flow on a new trie when a match node is hit, resolving priorities via
 * resolve_priority_sse), and the register is rebuilt from the possibly
 * updated transitions.
 */
static void
acl_process_matches(xmm_t *indices, int slot, const struct cne_acl_ctx *ctx, struct parms *parms,
                    struct acl_flow_data *flows)
{
    uint64_t transition1, transition2;

    /* extract transition from low 64 bits. */
    transition1 = _mm_cvtsi128_si64(*indices);

    /* extract transition from high 64 bits (swap halves first, since
     * _mm_cvtsi128_si64 only reads the low half). */
    *indices    = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64);
    transition2 = _mm_cvtsi128_si64(*indices);

    transition1 = acl_match_check(transition1, slot, ctx, parms, flows, resolve_priority_sse);
    transition2 = acl_match_check(transition2, slot + 1, ctx, parms, flows, resolve_priority_sse);

    /* update indices with new transitions. */
    *indices = _mm_set_epi64x(transition2, transition1);
}
134 
/*
 * Check for any match in 4 transitions (contained in 2 SSE registers).
 *
 * indices1/indices2 each carry two 64-bit transitions (flow slots
 * slot..slot+3). match_mask is xmm_match_mask.x. Loops until no lane has
 * the match bit set, because acl_process_matches may restart a flow on a
 * new trie whose first transition is itself a match node.
 */
static __cne_always_inline void
acl_match_check_x4(int slot, const struct cne_acl_ctx *ctx, struct parms *parms,
                   struct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2, xmm_t match_mask)
{
    xmm_t temp;

    /* put low 32 bits of each transition into one register
     * (0x88 selects lanes {0,2} of each source) */
    temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2, 0x88);
    /* test for match node */
    temp = _mm_and_si128(match_mask, temp);

    while (!_mm_testz_si128(temp, temp)) {
        acl_process_matches(indices1, slot, ctx, parms, flows);
        acl_process_matches(indices2, slot + 2, ctx, parms, flows);

        /* re-pack and re-test: restarted flows may match immediately */
        temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2, 0x88);
        temp = _mm_and_si128(match_mask, temp);
    }
}
157 
/*
 * Process 4 transitions (in 2 XMM registers) in parallel.
 *
 * next_input holds one 32-bit chunk of input per flow (one lane each);
 * only the low byte of each lane is consumed per call. trans is the
 * flattened transition table. indices1/indices2 are updated in place with
 * the four gathered 64-bit transitions.
 *
 * Returns next_input shifted right by one byte per lane, exposing the
 * next input byte for the following call.
 */
static __cne_always_inline xmm_t
transition4(xmm_t next_input, const uint64_t *trans, xmm_t *indices1, xmm_t *indices2)
{
    xmm_t addr, tr_lo, tr_hi;
    uint64_t trans0, trans2;

    /* Shuffle low 32 into tr_lo and high 32 into tr_hi */
    ACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi);

    /* Calculate the address (array index) for all 4 transitions. */
    ACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input, xmm_shuffle_input.x,
                     xmm_ones_16.x, xmm_range_base.x, tr_lo, tr_hi);

    /* Gather 64 bit transitions and pack back into 2 registers.
     * Each shuffle below moves the next lane of `addr` into slot 0 so it
     * can be read with _mm_cvtsi128_si32; note the shuffles are applied
     * cumulatively to the already-shuffled register. */

    trans0 = trans[_mm_cvtsi128_si32(addr)];

    /* get slot 2 */

    /* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
    addr   = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2);
    trans2 = trans[_mm_cvtsi128_si32(addr)];

    /* get slot 1 */

    /* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
    addr      = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1);
    *indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0);

    /* get slot 3 */

    /* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
    addr      = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3);
    *indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2);

    /* consume one byte per lane of input */
    return _mm_srli_epi32(next_input, CHAR_BIT);
}
198 
/*
 * Execute trie traversal with 8 traversals in parallel.
 *
 * ctx           - built ACL runtime context (tries + transition table)
 * data          - array of pointers to packet input buffers
 * results       - output array of matched rule ids, written via the
 *                 completion records as flows finish
 * total_packets - number of entries in data[]
 * categories    - number of result categories per packet
 *
 * Always returns 0.
 */
static inline int
search_sse_8(const struct cne_acl_ctx *ctx, const uint8_t **data, uint32_t *results,
             uint32_t total_packets, uint32_t categories)
{
    int n;
    struct acl_flow_data flows;
    uint64_t index_array[MAX_SEARCHES_SSE8];
    struct completion cmplt[MAX_SEARCHES_SSE8];
    struct parms parms[MAX_SEARCHES_SSE8];
    xmm_t input0, input1;
    xmm_t indices1, indices2, indices3, indices4;

    acl_set_flow(&flows, cmplt, CNE_DIM(cmplt), data, results, total_packets, categories,
                 ctx->trans_table);

    /* Start the first 8 flows, one per slot. */
    for (n = 0; n < MAX_SEARCHES_SSE8; n++) {
        cmplt[n].count = 0;
        index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
    }

    /*
     * indices1 contains index_array[0,1]
     * indices2 contains index_array[2,3]
     * indices3 contains index_array[4,5]
     * indices4 contains index_array[6,7]
     */

    indices1 = _mm_loadu_si128((xmm_t *)&index_array[0]);
    indices2 = _mm_loadu_si128((xmm_t *)&index_array[2]);

    indices3 = _mm_loadu_si128((xmm_t *)&index_array[4]);
    indices4 = _mm_loadu_si128((xmm_t *)&index_array[6]);

    /* Check for any matches (a root transition can be a match node). */
    acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
    acl_match_check_x4(4, ctx, parms, &flows, &indices3, &indices4, xmm_match_mask.x);

    while (flows.started > 0) {

        /* Gather 4 bytes of input data for each stream. */
        input0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
        input1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4));

        input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1);
        input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1);

        input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2);
        input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2);

        input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3);
        input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3);

        /* Process the 4 bytes of input on each stream:
         * one transition4 call consumes one byte per flow. */

        input0 = transition4(input0, flows.trans, &indices1, &indices2);
        input1 = transition4(input1, flows.trans, &indices3, &indices4);

        input0 = transition4(input0, flows.trans, &indices1, &indices2);
        input1 = transition4(input1, flows.trans, &indices3, &indices4);

        input0 = transition4(input0, flows.trans, &indices1, &indices2);
        input1 = transition4(input1, flows.trans, &indices3, &indices4);

        input0 = transition4(input0, flows.trans, &indices1, &indices2);
        input1 = transition4(input1, flows.trans, &indices3, &indices4);

        /* Check for any matches. */
        acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
        acl_match_check_x4(4, ctx, parms, &flows, &indices3, &indices4, xmm_match_mask.x);
    }

    return 0;
}
275 
276 /*
277  * Execute trie traversal with 4 traversals in parallel
278  */
279 static inline int
280 search_sse_4(const struct cne_acl_ctx *ctx, const uint8_t **data, uint32_t *results,
281  int total_packets, uint32_t categories)
282 {
283  int n;
284  struct acl_flow_data flows;
285  uint64_t index_array[MAX_SEARCHES_SSE4];
286  struct completion cmplt[MAX_SEARCHES_SSE4];
287  struct parms parms[MAX_SEARCHES_SSE4];
288  xmm_t input, indices1, indices2;
289 
290  acl_set_flow(&flows, cmplt, CNE_DIM(cmplt), data, results, total_packets, categories,
291  ctx->trans_table);
292 
293  for (n = 0; n < MAX_SEARCHES_SSE4; n++) {
294  cmplt[n].count = 0;
295  index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
296  }
297 
298  indices1 = _mm_loadu_si128((xmm_t *)&index_array[0]);
299  indices2 = _mm_loadu_si128((xmm_t *)&index_array[2]);
300 
301  /* Check for any matches. */
302  acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
303 
304  while (flows.started > 0) {
305 
306  /* Gather 4 bytes of input data for each stream. */
307  input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
308  input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1);
309  input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2);
310  input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3);
311 
312  /* Process the 4 bytes of input on each stream. */
313  input = transition4(input, flows.trans, &indices1, &indices2);
314  input = transition4(input, flows.trans, &indices1, &indices2);
315  input = transition4(input, flows.trans, &indices1, &indices2);
316  input = transition4(input, flows.trans, &indices1, &indices2);
317 
318  /* Check for any matches. */
319  acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
320  }
321 
322  return 0;
323 }
#define CNE_DIM(a)
Definition: cne_common.h:778
#define __cne_always_inline
Definition: cne_common.h:218