/* CNDP 22.08.0 — acl_run_avx2.h (extracted from the generated documentation site) */
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2010-2022 Intel Corporation
3  */
4 
10 #include <emmintrin.h> // for _mm_cvtsi128_si32, _mm_srli_si128
11 #include <immintrin.h> // for _mm256_set_epi64x, __m256, _mm256_and_si256
12 #include <limits.h> // for CHAR_BIT
13 #include <stdint.h> // for uint32_t, uint64_t, int32_t, uint8_t, uintp...
14 
15 #include "acl_run_sse.h" // for resolve_priority_sse
16 #include "acl.h" // for CNE_ACL_NODE_MATCH, cne_acl_ctx
17 #include "acl_run.h" // for GET_NEXT_4BYTES, acl_flow_data, CNE_ACL_NOD...
18 #include "acl_vect.h" // for ACL_TR_HILO, ACL_TR_CALC_ADDR
19 #include "cne_common.h" // for CNE_DIM, __cne_always_inline
20 #include "cne_vect.h" // for ymm_t, cne_ymm_t, xmm_t
21 
/* CNE_ACL_NODE_MATCH replicated across all eight 32-bit lanes; ANDed with
 * tr_lo to detect which of the 8 parallel flows sit on a match node. */
static const cne_ymm_t ymm_match_mask = {
    .u32 =
        {
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
        },
};
35 
/* CNE_ACL_NODE_INDEX replicated across all eight 32-bit lanes; used by
 * ACL_TR_CALC_ADDR to mask the node-index bits out of each transition. */
static const cne_ymm_t ymm_index_mask = {
    .u32 =
        {
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
        },
};
49 
/* Byte-shuffle control used by ACL_TR_CALC_ADDR: within each 128-bit half,
 * replicates byte 0/4/8/12 (the current input byte of each flow) into all
 * four byte positions of the corresponding 32-bit lane. */
static const cne_ymm_t ymm_shuffle_input = {
    .u32 =
        {
            0x00000000,
            0x04040404,
            0x08080808,
            0x0c0c0c0c,
            0x00000000,
            0x04040404,
            0x08080808,
            0x0c0c0c0c,
        },
};
63 
/* Constant 1 in every 16-bit lane; consumed by ACL_TR_CALC_ADDR
 * (range-comparison arithmetic — see acl_vect.h for the exact use). */
static const cne_ymm_t ymm_ones_16 = {
    .u16 =
        {
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
            1,
        },
};
85 
/* Per-lane base values (0xffffff00 + 0/4/8/12, repeated for each 128-bit
 * half) consumed by ACL_TR_CALC_ADDR when computing range-node addresses. */
static const cne_ymm_t ymm_range_base = {
    .u32 =
        {
            0xffffff00,
            0xffffff04,
            0xffffff08,
            0xffffff0c,
            0xffffff00,
            0xffffff04,
            0xffffff08,
            0xffffff0c,
        },
};
99 
100 /*
101  * Process 8 transitions in parallel.
102  * tr_lo contains low 32 bits for 8 transition.
103  * tr_hi contains high 32 bits for 8 transition.
104  * next_input contains up to 4 input bytes for 8 flows.
105  */
106 static __cne_always_inline ymm_t
107 transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
108 {
109  const int32_t *tr;
110  ymm_t addr;
111 
112  tr = (const int32_t *)(uintptr_t)trans;
113 
114  /* Calculate the address (array index) for all 8 transitions. */
115  ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input, ymm_shuffle_input.y,
116  ymm_ones_16.y, ymm_range_base.y, *tr_lo, *tr_hi);
117 
118  /* load lower 32 bits of 8 transactions at once. */
119  *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
120 
121  next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
122 
123  /* load high 32 bits of 8 transactions at once. */
124  *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
125 
126  return next_input;
127 }
128 
129 /*
130  * Process matches for 8 flows.
131  * tr_lo contains low 32 bits for 8 transition.
132  * tr_hi contains high 32 bits for 8 transition.
133  */
134 static inline void
135 acl_process_matches_avx2x8(const struct cne_acl_ctx *ctx, struct parms *parms,
136  struct acl_flow_data *flows, uint32_t slot, ymm_t matches, ymm_t *tr_lo,
137  ymm_t *tr_hi)
138 {
139  ymm_t t0, t1;
140  ymm_t lo, hi;
141  xmm_t l0, l1;
142  uint32_t i;
143  uint64_t tr[MAX_SEARCHES_SSE8];
144 
145  l1 = _mm256_extracti128_si256(*tr_lo, 1);
146  l0 = _mm256_castsi256_si128(*tr_lo);
147 
148  for (i = 0; i != CNE_DIM(tr) / 2; i++) {
149 
150  /*
151  * Extract low 32bits of each transition.
152  * That's enough to process the match.
153  */
154  tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
155  tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
156 
157  l0 = _mm_srli_si128(l0, sizeof(uint32_t));
158  l1 = _mm_srli_si128(l1, sizeof(uint32_t));
159 
160  tr[i] = acl_match_check(tr[i], slot + i, ctx, parms, flows, resolve_priority_sse);
161  tr[i + 4] =
162  acl_match_check(tr[i + 4], slot + i + 4, ctx, parms, flows, resolve_priority_sse);
163  }
164 
165  /* Collect new transitions into 2 YMM registers. */
166  t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
167  t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
168 
169  /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
170  ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
171 
172  /* Keep transitions with NOMATCH intact. */
173  *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
174  *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
175 }
176 
177 static inline void
178 acl_match_check_avx2x8(const struct cne_acl_ctx *ctx, struct parms *parms,
179  struct acl_flow_data *flows, uint32_t slot, ymm_t *tr_lo, ymm_t *tr_hi,
180  ymm_t match_mask)
181 {
182  uint32_t msk;
183  ymm_t matches, temp;
184 
185  /* test for match node */
186  temp = _mm256_and_si256(match_mask, *tr_lo);
187  matches = _mm256_cmpeq_epi32(temp, match_mask);
188  msk = _mm256_movemask_epi8(matches);
189 
190  while (msk != 0) {
191 
192  acl_process_matches_avx2x8(ctx, parms, flows, slot, matches, tr_lo, tr_hi);
193  temp = _mm256_and_si256(match_mask, *tr_lo);
194  matches = _mm256_cmpeq_epi32(temp, match_mask);
195  msk = _mm256_movemask_epi8(matches);
196  }
197 }
198 
/*
 * Execute trie traversal for up to 16 flows in parallel.
 * The 16 flows are handled as two groups of 8, each group living in one
 * pair of YMM registers (tr_lo/tr_hi). Each outer-loop iteration gathers
 * 4 input bytes per flow and runs 4 transition rounds (one byte each).
 * Returns 0 once all flows have been fully processed.
 */
static inline int
search_avx2x16(const struct cne_acl_ctx *ctx, const uint8_t **data, uint32_t *results,
               uint32_t total_packets, uint32_t categories)
{
    uint32_t n;
    struct acl_flow_data flows;
    uint64_t index_array[MAX_SEARCHES_AVX16]; /* start transition per flow slot */
    struct completion cmplt[MAX_SEARCHES_AVX16];
    struct parms parms[MAX_SEARCHES_AVX16];
    ymm_t input[2], tr_lo[2], tr_hi[2]; /* [0] = flows 0-7, [1] = flows 8-15 */
    ymm_t t0, t1;

    acl_set_flow(&flows, cmplt, CNE_DIM(cmplt), data, results, total_packets, categories,
                 ctx->trans_table);

    /* Start each of the 16 slots on its first trie. */
    for (n = 0; n < CNE_DIM(cmplt); n++) {
        cmplt[n].count = 0;
        index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
    }

    /* Split the 64-bit start transitions of flows 0-7 into lo/hi halves. */
    t0 = _mm256_set_epi64x(index_array[5], index_array[4], index_array[1], index_array[0]);
    t1 = _mm256_set_epi64x(index_array[7], index_array[6], index_array[3], index_array[2]);

    ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);

    /* Same for flows 8-15. */
    t0 = _mm256_set_epi64x(index_array[13], index_array[12], index_array[9], index_array[8]);
    t1 = _mm256_set_epi64x(index_array[15], index_array[14], index_array[11], index_array[10]);

    ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);

    /* Check for any matches. */
    acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], ymm_match_mask.y);
    acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], ymm_match_mask.y);

    while (flows.started > 0) {

        uint32_t in[MAX_SEARCHES_SSE8];

        /* Gather 4 bytes of input data for first 8 flows.
         * NOTE(review): loads are interleaved (0,4,1,5,...), presumably to
         * aid instruction scheduling — keep this order. */
        in[0] = GET_NEXT_4BYTES(parms, 0);
        in[4] = GET_NEXT_4BYTES(parms, 4);
        in[1] = GET_NEXT_4BYTES(parms, 1);
        in[5] = GET_NEXT_4BYTES(parms, 5);
        in[2] = GET_NEXT_4BYTES(parms, 2);
        in[6] = GET_NEXT_4BYTES(parms, 6);
        in[3] = GET_NEXT_4BYTES(parms, 3);
        in[7] = GET_NEXT_4BYTES(parms, 7);
        input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4], in[3], in[2], in[1], in[0]);

        /* Gather 4 bytes of input data for last 8 flows. */
        in[0] = GET_NEXT_4BYTES(parms, 8);
        in[4] = GET_NEXT_4BYTES(parms, 12);
        in[1] = GET_NEXT_4BYTES(parms, 9);
        in[5] = GET_NEXT_4BYTES(parms, 13);
        in[2] = GET_NEXT_4BYTES(parms, 10);
        in[6] = GET_NEXT_4BYTES(parms, 14);
        in[3] = GET_NEXT_4BYTES(parms, 11);
        in[7] = GET_NEXT_4BYTES(parms, 15);
        input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4], in[3], in[2], in[1], in[0]);

        /* Four unrolled transition rounds: one per gathered input byte. */
        input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
        input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);

        input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
        input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);

        input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
        input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);

        input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
        input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);

        /* Check for any matches. */
        acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], ymm_match_mask.y);
        acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], ymm_match_mask.y);
    }

    return 0;
}
/* Cross-reference residue from the documentation site:
 * CNE_DIM(a)          — defined in cne_common.h:778
 * __cne_always_inline — defined in cne_common.h:218
 */