10 #include <emmintrin.h>
11 #include <immintrin.h>
/*
 * 256-bit constant tables used by the AVX2 routines below.
 *
 * Only the opening line of each initializer is visible in this excerpt, so
 * the lane values themselves are not described here. How each table is used
 * by the visible code:
 *  - ymm_match_mask.y is passed to acl_match_check_avx2x8() as its match mask.
 *  - ymm_index_mask.y, ymm_shuffle_input.y, ymm_ones_16.y and ymm_range_base.y
 *    are passed to ACL_TR_CALC_ADDR() inside transition8().
 */
22 static const cne_ymm_t ymm_match_mask = {
36 static const cne_ymm_t ymm_index_mask = {
50 static const cne_ymm_t ymm_shuffle_input = {
64 static const cne_ymm_t ymm_ones_16 = {
86 static const cne_ymm_t ymm_range_base = {
/*
 * Perform one transition step for eight flows at once using AVX2 gathers.
 *
 * next_input: eight 32-bit lanes of pending input (callers load four bytes
 *             per flow into each lane).
 * trans:      table of 64-bit transition entries.
 * tr_lo:      in/out. Read by ACL_TR_CALC_ADDR(), then overwritten with the
 *             first 32-bit word of each gathered 64-bit entry.
 * tr_hi:      in/out. Read by ACL_TR_CALC_ADDR(), then overwritten with the
 *             second 32-bit word of each gathered 64-bit entry.
 *
 * NOTE(review): the return statement is not visible in this excerpt. Callers
 * assign the result back to their input vector, so this presumably returns
 * the shifted next_input - confirm.
 */
107 transition8(ymm_t next_input,
const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
/* View the 64-bit table as 32-bit words so each half can be gathered on its own. */
112 tr = (
const int32_t *)(uintptr_t)trans;
/*
 * ACL_TR_CALC_ADDR is defined elsewhere; addr is used below as the per-lane
 * gather index, so the macro is expected to fill it from the current
 * transitions and next_input - confirm against the macro definition.
 */
115 ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input, ymm_shuffle_input.y,
116 ymm_ones_16.y, ymm_range_base.y, *tr_lo, *tr_hi);
/*
 * The gather scale is sizeof(trans[0]) (one uint64_t), so addr counts whole
 * 64-bit entries. Starting at tr fetches the first 32-bit word of each entry
 * (the low half on little-endian x86).
 */
119 *tr_lo = _mm256_i32gather_epi32(tr, addr,
sizeof(trans[0]));
/* Shift every 32-bit lane right by one byte (CHAR_BIT bits). */
121 next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
/* Same indices, but starting at tr + 1: the second 32-bit word of each entry. */
124 *tr_hi = _mm256_i32gather_epi32(tr + 1, addr,
sizeof(trans[0]));
/*
 * Run acl_match_check() on each of eight flows and merge the values it
 * returns back into the transition vectors for the lanes selected by matches.
 *
 * ctx, parms, flows: forwarded unchanged to acl_match_check().
 * slot:    index of the first of the eight flows; lane i maps to slot + i.
 * matches: byte-wise blend selector; lanes whose bytes have the top bit set
 *          receive the new transition, other lanes keep their old value.
 * tr_lo:   in/out, 32-bit word per flow that is unpacked and checked here.
 *
 * NOTE(review): the end of the parameter list is not visible in this excerpt.
 * The body dereferences tr_hi, so a ymm_t *tr_hi parameter presumably
 * follows tr_lo - confirm.
 */
135 acl_process_matches_avx2x8(
const struct cne_acl_ctx *ctx,
struct parms *parms,
136 struct acl_flow_data *flows, uint32_t slot, ymm_t matches, ymm_t *tr_lo,
/* Scalar copies of the eight transitions, widened to 64 bits. */
143 uint64_t tr[MAX_SEARCHES_SSE8];
/* Split *tr_lo into its upper (l1) and lower (l0) 128-bit halves. */
145 l1 = _mm256_extracti128_si256(*tr_lo, 1);
146 l0 = _mm256_castsi256_si128(*tr_lo);
/*
 * Each iteration handles one lane from each half. The fixed "+ 4" offsets
 * below assume CNE_DIM(tr) / 2 == 4 - TODO confirm MAX_SEARCHES_SSE8 is 8.
 */
148 for (i = 0; i !=
CNE_DIM(tr) / 2; i++) {
/* Take the lowest 32-bit lane of each half, zero-extended into tr[]. */
154 tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
155 tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
/* Shift each half right by four bytes so the next lane becomes the lowest. */
157 l0 = _mm_srli_si128(l0,
sizeof(uint32_t));
158 l1 = _mm_srli_si128(l1,
sizeof(uint32_t));
/* Replace the transition for flow slot + i with what acl_match_check() returns. */
160 tr[i] = acl_match_check(tr[i], slot + i, ctx, parms, flows, resolve_priority_sse);
/*
 * NOTE(review): the line holding the assignment target for this call is not
 * visible; presumably the result is stored to tr[i + 4], mirroring the
 * statement above - confirm.
 */
162 acl_match_check(tr[i + 4], slot + i + 4, ctx, parms, flows, resolve_priority_sse);
/*
 * _mm256_set_epi64x() takes its arguments from most- to least-significant,
 * so from the lowest lane up t0 holds tr[0], tr[1], tr[4], tr[5] and t1
 * holds tr[2], tr[3], tr[6], tr[7].
 */
166 t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
167 t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
/*
 * ACL_TR_HILO is defined elsewhere; it is used here to produce lo and hi from
 * the packed 64-bit values (presumably the low and high 32-bit halves in
 * flow order - confirm against the macro definition).
 */
170 ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
/* Keep the old value where matches is clear, take lo/hi where it is set. */
173 *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
174 *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
/*
 * Test eight transitions against match_mask and hand the matching lanes to
 * acl_process_matches_avx2x8().
 *
 * ctx, parms, flows, slot: forwarded to acl_process_matches_avx2x8().
 * tr_lo, tr_hi: in/out transition vectors; tr_lo is the one tested for a
 *               match, and both may be rewritten by the processing call.
 *
 * NOTE(review): the end of the parameter list and the control flow around
 * the processing call are not visible in this excerpt. The body uses
 * match_mask (callers pass ymm_match_mask.y as the last argument), and the
 * mask is recomputed after the call, which suggests a loop that repeats
 * while msk is non-zero - confirm.
 */
178 acl_match_check_avx2x8(
const struct cne_acl_ctx *ctx,
struct parms *parms,
179 struct acl_flow_data *flows, uint32_t slot, ymm_t *tr_lo, ymm_t *tr_hi,
/*
 * A 32-bit lane matches when every bit of match_mask is set in *tr_lo.
 * matches is all-ones in such lanes and zero elsewhere; msk collects the top
 * bit of each byte of matches into an integer.
 */
186 temp = _mm256_and_si256(match_mask, *tr_lo);
187 matches = _mm256_cmpeq_epi32(temp, match_mask);
188 msk = _mm256_movemask_epi8(matches);
/* Resolve the matching lanes, then test the updated *tr_lo again. */
192 acl_process_matches_avx2x8(ctx, parms, flows, slot, matches, tr_lo, tr_hi);
193 temp = _mm256_and_si256(match_mask, *tr_lo);
194 matches = _mm256_cmpeq_epi32(temp, match_mask);
195 msk = _mm256_movemask_epi8(matches);
/*
 * Run the ACL search over sixteen flows in parallel, handled as two groups
 * of eight 32-bit lanes (flows 0-7 and flows 8-15).
 *
 * ctx:  passed to acl_start_next_trie() and to the match-check helper.
 * data, results, total_packets, categories: forwarded to acl_set_flow().
 *
 * NOTE(review): the return type, the final arguments of the acl_set_flow()
 * call, the closing braces and the return statement are not visible in this
 * excerpt. The literal indices 0..15 used below assume that
 * MAX_SEARCHES_AVX16 is at least 16 - confirm.
 */
203 search_avx2x16(
const struct cne_acl_ctx *ctx,
const uint8_t **data, uint32_t *results,
204 uint32_t total_packets, uint32_t categories)
/* Per-flow state, one element per parallel search. */
207 struct acl_flow_data flows;
208 uint64_t index_array[MAX_SEARCHES_AVX16];
209 struct completion cmplt[MAX_SEARCHES_AVX16];
210 struct parms parms[MAX_SEARCHES_AVX16];
/* Element 0 of each array serves flows 0-7, element 1 serves flows 8-15. */
211 ymm_t input[2], tr_lo[2], tr_hi[2];
/* Initialise the flow bookkeeping with the caller's buffers. */
214 acl_set_flow(&flows, cmplt,
CNE_DIM(cmplt), data, results, total_packets, categories,
/* Obtain the starting value for every flow from acl_start_next_trie(). */
217 for (n = 0; n <
CNE_DIM(cmplt); n++) {
219 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
/*
 * Pack the 64-bit starting values for flows 0-7 and let ACL_TR_HILO produce
 * tr_lo[0] and tr_hi[0]. _mm256_set_epi64x() lists the most significant
 * element first, so t0 holds entries 0, 1, 4, 5 and t1 holds 2, 3, 6, 7 from
 * the lowest lane up.
 */
222 t0 = _mm256_set_epi64x(index_array[5], index_array[4], index_array[1], index_array[0]);
223 t1 = _mm256_set_epi64x(index_array[7], index_array[6], index_array[3], index_array[2]);
225 ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
/* Same packing for flows 8-15 into tr_lo[1] and tr_hi[1]. */
227 t0 = _mm256_set_epi64x(index_array[13], index_array[12], index_array[9], index_array[8]);
228 t1 = _mm256_set_epi64x(index_array[15], index_array[14], index_array[11], index_array[10]);
230 ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
/* Check both groups for matches before any input has been consumed. */
233 acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], ymm_match_mask.y);
234 acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], ymm_match_mask.y);
/* Keep going while flows.started is still positive. */
236 while (flows.started > 0) {
/* Scratch buffer, filled once for flows 0-7 and then reused for flows 8-15. */
238 uint32_t in[MAX_SEARCHES_SSE8];
/* Fetch the next four input bytes of flows 0-7; lane k of input[0] is flow k. */
241 in[0] = GET_NEXT_4BYTES(parms, 0);
242 in[4] = GET_NEXT_4BYTES(parms, 4);
243 in[1] = GET_NEXT_4BYTES(parms, 1);
244 in[5] = GET_NEXT_4BYTES(parms, 5);
245 in[2] = GET_NEXT_4BYTES(parms, 2);
246 in[6] = GET_NEXT_4BYTES(parms, 6);
247 in[3] = GET_NEXT_4BYTES(parms, 3);
248 in[7] = GET_NEXT_4BYTES(parms, 7);
249 input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4], in[3], in[2], in[1], in[0]);
/* Fetch the next four input bytes of flows 8-15; lane k of input[1] is flow 8 + k. */
252 in[0] = GET_NEXT_4BYTES(parms, 8);
253 in[4] = GET_NEXT_4BYTES(parms, 12);
254 in[1] = GET_NEXT_4BYTES(parms, 9);
255 in[5] = GET_NEXT_4BYTES(parms, 13);
256 in[2] = GET_NEXT_4BYTES(parms, 10);
257 in[6] = GET_NEXT_4BYTES(parms, 14);
258 in[3] = GET_NEXT_4BYTES(parms, 11);
259 in[7] = GET_NEXT_4BYTES(parms, 15);
260 input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4], in[3], in[2], in[1], in[0]);
/*
 * Four transition steps per group, one for each of the four bytes just
 * loaded; the two groups are stepped alternately.
 */
262 input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
263 input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);
265 input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
266 input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);
268 input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
269 input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);
271 input[0] = transition8(input[0], flows.trans, &tr_lo[0], &tr_hi[0]);
272 input[1] = transition8(input[1], flows.trans, &tr_lo[1], &tr_hi[1]);
/* Check both groups for matches after the four steps. */
275 acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], ymm_match_mask.y);
276 acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], ymm_match_mask.y);
/*
 * Defines __cne_always_inline as an empty macro, so any later use of the
 * name expands to nothing.
 * NOTE(review): a #define only affects text that follows it, and identifiers
 * containing a double underscore are reserved for the implementation in C -
 * confirm that this placement and name are intended.
 */
#define __cne_always_inline