CNDP  22.08.0
acl_run_avx512x16.h
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2020-2022 Intel Corporation
 */

/*
 * Defines required by "acl_run_avx512_common.h".
 * Note that all of them have to be undefined by the end
 * of this file, as "acl_run_avx512_common.h" can be included several
 * times from different *.h files for the same *.c.
 */

/*
 * This implementation uses 512-bit registers (zmm) and intrinsics.
 * So our main SIMD type is 512 bits wide, and each such variable can
 * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
 */
#define _T_simd __m512i
#define _T_mask __mmask16

/* Naming convention for static const variables. */
#define _SC_(x) zmm_##x
#define _SV_(x) (zmm_##x.z)

/* Naming convention for internal functions. */
#define _F_(x) x##_avx512x16
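/*
 * For example, _SC_(match_mask) names the constant zmm_match_mask,
 * _SV_(match_mask) refers to its SIMD member zmm_match_mask.z, and
 * _F_(search_trie) expands to search_trie_avx512x16.
 */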

/*
 * The same intrinsics have a different syntax (depending on the bit-width),
 * so to overcome that a few macros need to be defined.
 */

/* Naming convention for generic epi (packed integers) type intrinsics. */
#define _M_I_(x) _mm512_##x

/* Naming convention for si (whole SIMD integer) type intrinsics. */
#define _M_SI_(x) _mm512_##x##_si512

/* Naming convention for masked gather type intrinsics. */
#define _M_MGI_(x) _mm512_##x

/* Naming convention for gather type intrinsics. */
#define _M_GI_(name, idx, base, scale) _mm512_##name(idx, base, scale)
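/*
 * For example, _M_I_(add_epi32) expands to _mm512_add_epi32,
 * _M_SI_(and) to _mm512_and_si512, and
 * _M_GI_(i32gather_epi32, idx, base, scale) to
 * _mm512_i32gather_epi32(idx, base, scale).
 */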

/* num/mask of transitions per SIMD regs */
#define _SIMD_MASK_BIT_ (sizeof(_T_simd) / sizeof(uint32_t))
#define _SIMD_MASK_MAX_ CNE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)

#define _SIMD_FLOW_NUM_ (2 * _SIMD_MASK_BIT_)
#define _SIMD_FLOW_MSK_ (_SIMD_FLOW_NUM_ - 1)

/* num/mask of pointers per SIMD regs */
#define _SIMD_PTR_NUM_ (sizeof(_T_simd) / sizeof(uintptr_t))
#define _SIMD_PTR_MSK_ CNE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
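/*
 * With 64-byte zmm registers this gives _SIMD_MASK_BIT_ == 16,
 * _SIMD_MASK_MAX_ == 0xffff and _SIMD_FLOW_NUM_ == 32; assuming 64-bit
 * pointers, _SIMD_PTR_NUM_ == 8 and _SIMD_PTR_MSK_ == 0xff.
 */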

static const __cne_x86_zmm_t _SC_(match_mask) = {
    .u32 =
        {
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
            CNE_ACL_NODE_MATCH,
        },
};

static const __cne_x86_zmm_t _SC_(index_mask) = {
    .u32 =
        {
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
            CNE_ACL_NODE_INDEX,
        },
};
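/*
 * The match/index masks above are replicated into every 32-bit lane so a
 * single SIMD operation can apply them to 16 transitions at once.
 */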

static const __cne_x86_zmm_t _SC_(trlo_idle) = {
    .u32 =
        {
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
            CNE_ACL_IDLE_NODE,
        },
};

// clang-format off
static const __cne_x86_zmm_t _SC_(trhi_idle) = {
    .u32 = {
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
    },
};

static const __cne_x86_zmm_t _SC_(shuffle_input) = {
    .u32 = {
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
        0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
    },
};

static const __cne_x86_zmm_t _SC_(four_32) = {
    .u32 = {
        4, 4, 4, 4,
        4, 4, 4, 4,
        4, 4, 4, 4,
        4, 4, 4, 4,
    },
};

static const __cne_x86_zmm_t _SC_(idx_add) = {
    .u32 = {
        0, 1, 2, 3,
        4, 5, 6, 7,
        8, 9, 10, 11,
        12, 13, 14, 15,
    },
};

static const __cne_x86_zmm_t _SC_(range_base) = {
    .u32 = {
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
        0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
    },
};

static const __cne_x86_zmm_t _SC_(pminp) = {
    .u32 = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    },
};
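
/*
 * pminp holds indices for _mm512_permutex2var_epi32(): values 0x00-0x07
 * select the low eight dwords of the first operand and 0x10-0x17 the low
 * eight dwords of the second, which is how gather_bytes() below squeezes
 * two 256-bit gather results into one 512-bit register.
 */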

static const _T_mask _SC_(pmidx_msk) = 0x5555;

static const __cne_x86_zmm_t _SC_(pmidx[2]) = {
    [0] = {
        .u32 = {
            0, 0, 1, 0, 2, 0, 3, 0,
            4, 0, 5, 0, 6, 0, 7, 0,
        },
    },
    [1] = {
        .u32 = {
            8, 0, 9, 0, 10, 0, 11, 0,
            12, 0, 13, 0, 14, 0, 15, 0,
        },
    },
};
// clang-format on

/*
 * Unfortunately, the current AVX512 ISA doesn't provide the ability to
 * gather-load byte quantities. So we have to mimic it in SW,
 * by doing 8x1B scalar loads.
 */
static inline __m256i
_m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
{
    cne_ymm_t v;
    __cne_x86_zmm_t p;

    static const uint32_t zero;

    p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_, (uintptr_t)&zero);
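
    /*
     * Lanes not selected by 'mask' now point at 'zero', so the eight
     * scalar loads below always read from valid memory.
     */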
    v.u32[0] = *(uint8_t *)p.u64[0];
    v.u32[1] = *(uint8_t *)p.u64[1];
    v.u32[2] = *(uint8_t *)p.u64[2];
    v.u32[3] = *(uint8_t *)p.u64[3];
    v.u32[4] = *(uint8_t *)p.u64[4];
    v.u32[5] = *(uint8_t *)p.u64[5];
    v.u32[6] = *(uint8_t *)p.u64[6];
    v.u32[7] = *(uint8_t *)p.u64[7];

    return v.y;
}

/*
 * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
 */
static __cne_always_inline __m512i
_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2], uint32_t bnum)
{
    __m256i inp[2];

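    /*
     * bnum selects how much to fetch per location: a single byte
     * (emulated above with scalar loads) or a 32-bit load via the
     * hardware gather.
     */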
    if (bnum == sizeof(uint8_t)) {
        inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
        inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
    } else {
        inp[0] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero), m[0], p[0], NULL,
                                             sizeof(uint8_t));
        inp[1] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero), m[1], p[1], NULL,
                                             sizeof(uint8_t));
    }

    /* squeeze input into one 512-bit register */
    return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]), _SV_(pminp),
                                     _mm512_castsi256_si512(inp[1]));
}

/*
 * Resolve matches for multiple categories (GT 8, use 512b instructions/regs)
 */
static inline void
resolve_mcgt8_avx512x1(uint32_t result[], const struct cne_acl_match_results pr[],
                       const uint32_t match[], uint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)
{
    const int32_t *pri;
    const uint32_t *pm, *res;
    uint32_t i, k, mi;
    __mmask16 cm, sm;
    __m512i cp, cr, np, nr;

    const uint32_t match_log = 5;
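    /*
     * Each cne_acl_match_results entry spans 2^match_log == 32 uint32_t
     * words (results and priorities per category), so shifting a match
     * index left by match_log yields the word offset of its entry.
     */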

    res = pr->results;
    pri = pr->priority;

    cm = (1 << nb_cat) - 1;

    for (k = 0; k != nb_pkt; k++, result += nb_cat) {

        mi = match[k] << match_log;

        cr = _mm512_maskz_loadu_epi32(cm, res + mi);
        cp = _mm512_maskz_loadu_epi32(cm, pri + mi);

        for (i = 1, pm = match + nb_pkt; i != nb_trie; i++, pm += nb_pkt) {

            mi = pm[k] << match_log;

            nr = _mm512_maskz_loadu_epi32(cm, res + mi);
            np = _mm512_maskz_loadu_epi32(cm, pri + mi);

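            /*
             * Keep the current (result, priority) pair only in lanes where
             * it still has the higher priority; otherwise take the values
             * just loaded for this trie.
             */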
            sm = _mm512_cmpgt_epi32_mask(cp, np);
            cr = _mm512_mask_mov_epi32(nr, sm, cr);
            cp = _mm512_mask_mov_epi32(np, sm, cp);
        }

        _mm512_mask_storeu_epi32(result, cm, cr);
    }
}

#include "acl_run_avx512_common.h"

/*
 * Perform search for up to (2 * 16) flows in parallel.
 * Use two sets of metadata, each serving up to 16 flows.
 */
static inline int
search_avx512x16x2(const struct cne_acl_ctx *ctx, const uint8_t **data, uint32_t *results,
                   uint32_t total_packets, uint32_t categories)
{
    uint32_t i, *pm;
    const struct cne_acl_match_results *pr;
    struct acl_flow_avx512 flow;
    uint32_t match[ctx->num_tries * total_packets];
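
    /*
     * 'match' collects one match index per (trie, packet) pair: trie i
     * writes its indexes into the slice starting at match[i * total_packets].
     */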
    for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {

        /* setup for next trie */
        acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);

        /* process the trie */
        _F_(search_trie)(&flow);
    }

    /* resolve matches */
    pr = (const struct cne_acl_match_results *)(ctx->trans_table + ctx->match_index);

    if (categories == 1)
        _F_(resolve_single_cat)(results, pr, match, total_packets, ctx->num_tries);
    else if (categories <= CNE_ACL_MAX_CATEGORIES / 2)
        resolve_mcle8_avx512x1(results, pr, match, total_packets, categories, ctx->num_tries);
    else
        resolve_mcgt8_avx512x1(results, pr, match, total_packets, categories, ctx->num_tries);

    return 0;
}

#undef _SIMD_PTR_MSK_
#undef _SIMD_PTR_NUM_
#undef _SIMD_FLOW_MSK_
#undef _SIMD_FLOW_NUM_
#undef _SIMD_MASK_MAX_
#undef _SIMD_MASK_BIT_
#undef _M_GI_
#undef _M_MGI_
#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _SV_
#undef _SC_
#undef _T_mask
#undef _T_simd