#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// Common implementation for NEON: stores go through the generic
// builtin-vector implementation.
struct _CommonImplNeon : _CommonImplBuiltin
{
  using _CommonImplBuiltin::_S_store;
};
 
// ABI-specific simd implementation on top of the generic builtin-vector one.
template <typename _Abi, typename>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // Masked load: copy only the elements selected by __k from __mem into
    // __merge, converting element-wise from _Up to _Tp.
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }
 
    // Masked store without conversion: write only the elements selected by
    // __k from __v to __mem.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }
 
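    // Usage sketch (illustrative only, not part of this header): these hooks
    // back the where() expressions of std::experimental::simd, e.g.
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd<float> __v = 0.f;
    //   float __buf[stdx::native_simd<float>::size()] = {};
    //   where(__v > 0.f, __v).copy_from(__buf, stdx::element_aligned); // -> _S_masked_load
    //   where(__v > 0.f, __v).copy_to(__buf, stdx::element_aligned);   // -> _S_masked_store_nocvt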
    // Reduction: fold the elements of __x with __binary_op.
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        constexpr size_t _Np = __x.size();
        if constexpr (sizeof(__x) == 16 && _Np >= 4
                      && !_Abi::template _S_is_partial<_Tp>)
          {
            // Split the 128-bit vector into two 64-bit halves, combine them,
            // and recurse on the 64-bit ABI.
            const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
            const auto __y = __binary_op(__halves[0], __halves[1]);
            return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
              __y, static_cast<_BinaryOperation&&>(__binary_op));
          }
        else if constexpr (_Np == 8)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                       __x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 4)
          {
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<1, 0, 3, 2>(__x._M_data)));
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<3, 2, 1, 0>(__x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 2)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0>(__x._M_data)));
            return __x[0];
          }
        else
          return _Base::_S_reduce(__x,
                                  static_cast<_BinaryOperation&&>(__binary_op));
      }
 
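    // How the permute-based reduction works (illustrative trace, assuming
    // __binary_op is plus<> and _Np == 4 with elements {a, b, c, d}):
    //   step 1: x op permute<1,0,3,2>(x) -> {a+b, b+a, c+d, d+c}
    //   step 2: x op permute<3,2,1,0>(x) -> {a+b+c+d, ...}
    //   return x[0]                       -> a+b+c+d
    // i.e. log2(_Np) vector combine steps instead of _Np - 1 scalar ones.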
    // Square root: use the A64 vsqrt intrinsics where available, otherwise
    // fall back to the generic implementation.
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }
 
    // Truncation (round toward zero).
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW
      _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            // No vrnd intrinsics: round-trip through int32, which truncates,
            // and blend with the original value for large magnitudes.
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }
 
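    // Why 0x1p23f: float has 24 significand bits, so every value with
    // |x| >= 2^23 is already an integer; such lanes need no truncation and
    // the int32 round trip (which could also overflow beyond 2^31) is
    // skipped by keeping the original value in the blend above.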
    // Rounding to nearest, ties away from zero (vrnda).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }
 
    // Floor (round toward negative infinity, vrndm).
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }
 
    // Ceil (round toward positive infinity, vrndp).
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }
  };
 
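// Intrinsic naming used by the _SimdImplNeon rounding members above (a
// summary for reference, not new behavior): vrnd* truncates toward zero,
// vrnda* rounds to nearest with ties away from zero, vrndm* rounds toward
// -inf (floor), vrndp* rounds toward +inf (ceil); the 'q' suffix selects the
// 128-bit form, no suffix the 64-bit form.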
// Mask helper mixin for NEON: convert a vector mask to a bit mask.
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              // Keep bit __i (mod 8) in every true lane, then sum the lanes
              // pairwise to concentrate the bits.
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                    return static_cast<_I>(
                      __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                          __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                  __zero),
                         __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};
 
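// Sketch of the bit-extraction trick above (an illustrative aside, shown for
// a 64-bit mask with byte-sized elements): after `__asint &= __bitsel`, lane
// __i holds either 1 << __i or 0, so the horizontal sum of all lanes equals
// the bit mask directly. For example {1, 0, 4, 8, 0, 0, 0, 0} sums to
// 0b1101. vaddv_s8 (A64) or repeated vpadd_s8 folds (A32) compute that
// horizontal sum.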
// Mask implementation for NEON.
template <typename _Abi, typename>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;
 
    // Return true iff every element of __k is true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
            | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }
 
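    // Note on the `| ~_S_implicit_mask` step above (explanatory aside): for a
    // partial ABI the padding lanes are OR-ed with all-ones, so they cannot
    // make all_of fail. all_of then only has to test for an all-ones vector;
    // on the 16-byte path the two 64-bit halves of an all-ones vector are
    // each -1, hence the sum compares against -2.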
    // Return true iff at least one element of __k is true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
            | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }
 
    // Return true iff no element of __k is true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }
 
    // Return true iff some, but not all, elements of __k are true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                              | ~__vector_bitcast<char>(
                                _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }
 
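    // The `+ 1 > 1` test above treats __kk as one unsigned integer: 0 (no bit
    // set) yields 1 and ~0 (every bit set) wraps around to 0, so both fail
    // the test; only a value with part of its bits set makes it true.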
    // Population count of a mask.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }
 
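    // Illustration of the sign trick above (an aside, not original code): a
    // true lane compares to -1, so for a mask like {-1, 0, -1, 0} the
    // horizontal sum is -2 and negating it yields popcount == 2; the vpadd_*
    // calls fold neighbouring lanes until one lane holds the full sum.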
    // Index of the first set element; forwarded to the generic implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        return _Base::_S_find_first_set(__k);
      }

    // Index of the last set element; forwarded to the generic implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        return _Base::_S_find_last_set(__k);
      }
  };
 
_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
 