10 #include <emmintrin.h>
12 #include <smmintrin.h>
14 #include <tmmintrin.h>
15 #include <xmmintrin.h>
/*
 * Immediate operands for _mm_shuffle_epi32(). Each value packs four 2-bit
 * source-lane selectors, destination lane 0 in the lowest two bits.
 * The three SLOTn values only change destination lane 0 (lanes 1..3 select
 * themselves), so they can be applied one after another to the same register.
 */
/* 0xe5: lane 0 <- lane 1. */
25 SHUFFLE32_SLOT1 = 0xe5,
/* 0xe6: lane 0 <- lane 2. */
26 SHUFFLE32_SLOT2 = 0xe6,
/* 0xe7: lane 0 <- lane 3. */
27 SHUFFLE32_SLOT3 = 0xe7,
/* 0x4e: exchange the low and high 64-bit halves of the register. */
28 SHUFFLE32_SWAP64 = 0x4e,
/*
 * Four 32-bit words, each holding one byte value (0, 4, 8, 12) repeated in
 * all four byte positions. transition4() hands this to ACL_TR_CALC_ADDR().
 * NOTE(review): presumably a byte-shuffle control that replicates the low
 * byte of every 32-bit input lane -- confirm against the macro definition.
 */
31 static const cne_xmm_t xmm_shuffle_input = {
32 .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
/*
 * Eight 16-bit lanes, each set to 1. transition4() hands this to
 * ACL_TR_CALC_ADDR(); how the macro uses it is not visible here.
 */
35 static const cne_xmm_t xmm_ones_16 = {
36 .u16 = {1, 1, 1, 1, 1, 1, 1, 1},
/*
 * Mask passed as match_mask to acl_match_check_x4(), where it is ANDed with
 * the low 32-bit word of each of four transitions; a non-zero result means
 * matches still have to be processed.
 * The initializer is not visible in this view.
 */
39 static const cne_xmm_t xmm_match_mask = {
/*
 * Constant handed to ACL_TR_CALC_ADDR() by transition4().
 * The initializer is not visible in this view.
 * NOTE(review): the name suggests it isolates the index bits of a
 * transition -- confirm against the macro definition.
 */
49 static const cne_xmm_t xmm_index_mask = {
/*
 * Constant handed to ACL_TR_CALC_ADDR() by transition4().
 * The initializer is not visible in this view.
 * NOTE(review): presumably used for range-node address calculation --
 * confirm against the macro definition.
 */
59 static const cne_xmm_t xmm_range_base = {
/*
 * Merge the match results stored at p[transition] into the running
 * per-category results of search slot n, keeping the higher-priority entry
 * for each category.
 *
 * transition  index into the match-results array p
 * n           index of the search slot in parms
 * ctx         ACL context; only num_tries is read here
 * parms       per-slot state; parms[n].cmplt receives the merged output
 * p           array of match results (results[] and priority[] per entry)
 * categories  number of categories to process
 *
 * Passed as the resolver callback to acl_match_check() by
 * acl_process_matches().
 */
77 resolve_priority_sse(uint64_t transition,
int n,
const struct cne_acl_ctx *ctx,
struct parms *parms,
78 const struct cne_acl_match_results *p, uint32_t categories)
81 xmm_t results, priority, results1, priority1, selector;
82 xmm_t *saved_results, *saved_priority;
/* One 128-bit vector is handled per step of CNE_ACL_RESULTS_MULTIPLIER. */
84 for (x = 0; x < categories; x += CNE_ACL_RESULTS_MULTIPLIER) {
/* Where the running best results/priorities for this slot are kept. */
86 saved_results = (xmm_t *)(&parms[n].cmplt->results[x]);
87 saved_priority = (xmm_t *)(&parms[n].cmplt->priority[x]);
/* Results and priorities of the match that was just hit (unaligned loads). */
90 results = _mm_loadu_si128((
const xmm_t *)&p[transition].results[x]);
91 priority = _mm_loadu_si128((
const xmm_t *)&p[transition].priority[x]);
/*
 * Only merge with the saved values when the completion count differs from
 * ctx->num_tries.
 * NOTE(review): presumably count == num_tries means nothing has been saved
 * for this slot yet -- confirm where count is updated.
 */
94 if (parms[n].cmplt->count != ctx->num_tries) {
97 results1 = _mm_loadu_si128(saved_results);
98 priority1 = _mm_loadu_si128(saved_priority);
/*
 * Signed 32-bit compare: a lane is all-ones where the saved priority is
 * strictly greater. The blends then keep the saved value in those lanes and
 * the new value everywhere else (so ties go to the new match).
 */
101 selector = _mm_cmpgt_epi32(priority1, priority);
102 results = _mm_blendv_epi8(results, results1, selector);
103 priority = _mm_blendv_epi8(priority, priority1, selector);
/* Write the (possibly merged) values back as the new running best. */
107 _mm_storeu_si128(saved_results, results);
108 _mm_storeu_si128(saved_priority, priority);
/*
 * Run acl_match_check() on the two 64-bit transitions held in *indices and
 * store the returned transitions back into *indices.
 *
 * indices  in/out: two packed 64-bit transitions
 * slot     search slot of the low transition; the high one uses slot + 1
 * ctx, parms, flows  forwarded unchanged to acl_match_check()
 */
116 acl_process_matches(xmm_t *indices,
int slot,
const struct cne_acl_ctx *ctx,
struct parms *parms,
117 struct acl_flow_data *flows)
119 uint64_t transition1, transition2;
/* Low 64 bits -> first transition. */
122 transition1 = _mm_cvtsi128_si64(*indices);
/* Swap the 64-bit halves so the second transition can be read from the low half. */
125 *indices = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64);
126 transition2 = _mm_cvtsi128_si64(*indices);
/* resolve_priority_sse is supplied as the priority-resolution callback. */
128 transition1 = acl_match_check(transition1, slot, ctx, parms, flows, resolve_priority_sse);
129 transition2 = acl_match_check(transition2, slot + 1, ctx, parms, flows, resolve_priority_sse);
/* Repack in the original order: transition1 low, transition2 high. */
132 *indices = _mm_set_epi64x(transition2, transition1);
/*
 * Test four transitions (two in *indices1, two in *indices2) against
 * match_mask and keep processing matches until none of the four has a
 * masked bit set.
 *
 * slot        search slot of the first transition; the four use slot..slot+3
 * ctx, parms, flows  forwarded to acl_process_matches()
 * indices1    in/out: transitions for slot and slot + 1
 * indices2    in/out: transitions for slot + 2 and slot + 3
 * match_mask  bits that flag a match in the low 32-bit word of a transition
 */
139 acl_match_check_x4(
int slot,
const struct cne_acl_ctx *ctx,
struct parms *parms,
140 struct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2, xmm_t match_mask)
/*
 * Shuffle immediate 0x88 selects 32-bit lanes 0 and 2 of each operand, i.e.
 * the low 32-bit word of every one of the four 64-bit transitions.
 */
145 temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2, 0x88);
147 temp = _mm_and_si128(match_mask, temp);
/* _mm_testz_si128(temp, temp) is non-zero only when temp is all zero. */
149 while (!_mm_testz_si128(temp, temp)) {
150 acl_process_matches(indices1, slot, ctx, parms, flows);
151 acl_process_matches(indices2, slot + 2, ctx, parms, flows);
/* Re-test: the transitions written back may themselves carry masked bits. */
153 temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2, 0x88);
154 temp = _mm_and_si128(match_mask, temp);
/*
 * Advance four flows by one step: compute four indexes into trans from the
 * current transitions and next_input, then load the four new transitions.
 *
 * next_input  four 32-bit lanes of input data, one lane per flow
 * trans       transition table indexed by the computed addresses
 * indices1    in/out: transitions of flows 0 and 1
 * indices2    in/out: transitions of flows 2 and 3
 *
 * Returns next_input with every 32-bit lane logically shifted right by
 * CHAR_BIT bits, ready for the next call.
 */
162 transition4(xmm_t next_input,
const uint64_t *trans, xmm_t *indices1, xmm_t *indices2)
164 xmm_t addr, tr_lo, tr_hi;
165 uint64_t trans0, trans2;
/*
 * NOTE(review): ACL_TR_HILO presumably splits the four 64-bit transitions
 * into their low and high 32-bit words (tr_lo / tr_hi) -- macro not visible.
 */
168 ACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi);
/* Produces addr; its four 32-bit lanes are used below as indexes into trans. */
171 ACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input, xmm_shuffle_input.x,
172 xmm_ones_16.x, xmm_range_base.x, tr_lo, tr_hi);
/* Lane 0 of addr -> new transition for flow 0. */
176 trans0 = trans[_mm_cvtsi128_si32(addr)];
/*
 * The shuffles below only overwrite lane 0 and leave lanes 1..3 intact, so
 * lanes 2, 1 and 3 can be brought down one after another.
 */
181 addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2);
182 trans2 = trans[_mm_cvtsi128_si32(addr)];
/* Flow 1 goes to the high half, flow 0 to the low half of *indices1. */
187 addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1);
188 *indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0);
/* Flow 3 goes to the high half, flow 2 to the low half of *indices2. */
193 addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3);
194 *indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2);
196 return _mm_srli_epi32(next_input, CHAR_BIT);
/*
 * Run the trie search with MAX_SEARCHES_SSE8 search slots, handled as two
 * groups of four slots (0..3 and 4..7).
 *
 * ctx            ACL context
 * data           array of input buffers, handed to acl_set_flow()
 * results        output array, handed to acl_set_flow()
 * total_packets  number of inputs, handed to acl_set_flow()
 * categories     number of result categories, handed to acl_set_flow()
 *
 * NOTE(review): the return statement is not visible in this view.
 */
203 search_sse_8(
const struct cne_acl_ctx *ctx,
const uint8_t **data, uint32_t *results,
204 uint32_t total_packets, uint32_t categories)
207 struct acl_flow_data flows;
208 uint64_t index_array[MAX_SEARCHES_SSE8];
209 struct completion cmplt[MAX_SEARCHES_SSE8];
210 struct parms parms[MAX_SEARCHES_SSE8];
211 xmm_t input0, input1;
212 xmm_t indices1, indices2, indices3, indices4;
/* Remaining arguments of this call are not visible in this view. */
214 acl_set_flow(&flows, cmplt,
CNE_DIM(cmplt), data, results, total_packets, categories,
/* Start a trie in every slot and record its initial transition. */
217 for (n = 0; n < MAX_SEARCHES_SSE8; n++) {
219 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
/*
 * Pack the transitions two per register. Reading elements 0..7 assumes
 * MAX_SEARCHES_SSE8 is at least 8 -- TODO confirm.
 */
229 indices1 = _mm_loadu_si128((xmm_t *)&index_array[0]);
230 indices2 = _mm_loadu_si128((xmm_t *)&index_array[2]);
232 indices3 = _mm_loadu_si128((xmm_t *)&index_array[4]);
233 indices4 = _mm_loadu_si128((xmm_t *)&index_array[6]);
/* Handle matches already flagged on the starting transitions. */
236 acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
237 acl_match_check_x4(4, ctx, parms, &flows, &indices3, &indices4, xmm_match_mask.x);
239 while (flows.started > 0) {
/*
 * Build one 32-bit input lane per slot: input0 covers slots 0..3, input1
 * covers slots 4..7.
 * NOTE(review): GET_NEXT_4BYTES presumably fetches the next four input bytes
 * of the given slot -- macro not visible.
 */
242 input0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
243 input1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4));
245 input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1);
246 input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1);
248 input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2);
249 input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2);
251 input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3);
252 input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3);
/*
 * Four transition steps per group; each call returns its input shifted
 * right by CHAR_BIT bits per lane, which is fed into the next call.
 */
256 input0 = transition4(input0, flows.trans, &indices1, &indices2);
257 input1 = transition4(input1, flows.trans, &indices3, &indices4);
259 input0 = transition4(input0, flows.trans, &indices1, &indices2);
260 input1 = transition4(input1, flows.trans, &indices3, &indices4);
262 input0 = transition4(input0, flows.trans, &indices1, &indices2);
263 input1 = transition4(input1, flows.trans, &indices3, &indices4);
265 input0 = transition4(input0, flows.trans, &indices1, &indices2);
266 input1 = transition4(input1, flows.trans, &indices3, &indices4);
/* Process any matches reached after these four steps. */
269 acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
270 acl_match_check_x4(4, ctx, parms, &flows, &indices3, &indices4, xmm_match_mask.x);
/*
 * Run the trie search with MAX_SEARCHES_SSE4 search slots, handled as one
 * group of four.
 *
 * ctx            ACL context
 * data           array of input buffers, handed to acl_set_flow()
 * results        output array, handed to acl_set_flow()
 * total_packets  number of inputs, handed to acl_set_flow()
 * categories     number of result categories, handed to acl_set_flow()
 *
 * NOTE(review): total_packets is declared int here but uint32_t in
 * search_sse_8 -- confirm whether the difference is intentional.
 * NOTE(review): the return statement is not visible in this view.
 */
280 search_sse_4(
const struct cne_acl_ctx *ctx,
const uint8_t **data, uint32_t *results,
281 int total_packets, uint32_t categories)
284 struct acl_flow_data flows;
285 uint64_t index_array[MAX_SEARCHES_SSE4];
286 struct completion cmplt[MAX_SEARCHES_SSE4];
287 struct parms parms[MAX_SEARCHES_SSE4];
288 xmm_t input, indices1, indices2;
/* Remaining arguments of this call are not visible in this view. */
290 acl_set_flow(&flows, cmplt,
CNE_DIM(cmplt), data, results, total_packets, categories,
/* Start a trie in every slot and record its initial transition. */
293 for (n = 0; n < MAX_SEARCHES_SSE4; n++) {
295 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
/*
 * Pack the transitions two per register. Reading elements 0..3 assumes
 * MAX_SEARCHES_SSE4 is at least 4 -- TODO confirm.
 */
298 indices1 = _mm_loadu_si128((xmm_t *)&index_array[0]);
299 indices2 = _mm_loadu_si128((xmm_t *)&index_array[2]);
/* Handle matches already flagged on the starting transitions. */
302 acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
304 while (flows.started > 0) {
/*
 * Build one 32-bit input lane per slot.
 * NOTE(review): GET_NEXT_4BYTES presumably fetches the next four input bytes
 * of the given slot -- macro not visible.
 */
307 input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
308 input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1);
309 input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2);
310 input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3);
/*
 * Four transition steps; each call returns its input shifted right by
 * CHAR_BIT bits per lane, which is fed into the next call.
 */
313 input = transition4(input, flows.trans, &indices1, &indices2);
314 input = transition4(input, flows.trans, &indices1, &indices2);
315 input = transition4(input, flows.trans, &indices1, &indices2);
316 input = transition4(input, flows.trans, &indices1, &indices2);
/* Process any matches reached after these four steps. */
319 acl_match_check_x4(0, ctx, parms, &flows, &indices1, &indices2, xmm_match_mask.x);
/*
 * Defines __cne_always_inline with an empty expansion, so any later use of
 * the name expands to nothing.
 * NOTE(review): this sits after all the functions visible above; confirm
 * that the placement and the empty definition are intended.
 */
#define __cne_always_inline