#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon: no NEON-specific store overloads are needed; the
// generic builtin implementation is inherited unchanged.
struct _CommonImplNeon : _CommonImplBuiltin
{
  using _CommonImplBuiltin::_S_store;
};
// _SimdImplNeon: simd operations specialized for NEON vector registers.
template <typename _Abi, typename>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;
    // _S_masked_load: element-wise load; only elements whose mask lane is
    // set are read from __mem and written into __merge.
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }
    // _S_masked_store_nocvt: element-wise store without type conversion;
    // only elements whose mask lane is set are written to __mem.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }
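    // Example (a sketch, not part of the header; assumes an AArch64 target
    // and `namespace stdx = std::experimental`): these masked memory
    // operations are what where-expression copies lower to.
    //
    //   stdx::native_simd<float> __v = 0.f;
    //   alignas(16) const float __buf[] = {1, 2, 3, 4};
    //   where(__v == 0.f, __v).copy_from(__buf, stdx::element_aligned);
    //
    // Only elements whose mask lane is true are read from __buf; all other
    // lanes of __v keep their previous values. Stores via copy_to reach
    // _S_masked_store_nocvt the same way.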
    // _S_reduce: horizontal reduction of __x with __binary_op, using
    // vector permutes so each step halves the number of live elements.
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        constexpr size_t _Np = __x.size();
        if constexpr (sizeof(__x) == 16 && _Np >= 4
                      && !_Abi::template _S_is_partial<_Tp>)
          {
            // Combine the two 8-byte halves, then recurse on 8 bytes.
            const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
            const auto __y = __binary_op(__halves[0], __halves[1]);
            return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
              __y, static_cast<_BinaryOperation&&>(__binary_op));
          }
        else if constexpr (_Np == 8)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                       __x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 4)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0, 3, 2>(__x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<3, 2, 1, 0>(__x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 2)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0>(__x._M_data)));
            return __x[0];
          }
        else
          return _Base::_S_reduce(__x,
                                  static_cast<_BinaryOperation&&>(__binary_op));
      }
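    // Data-flow sketch for the _Np == 4 branch above, taking __binary_op
    // to be plus and __x = {a, b, c, d} (illustration only):
    //
    //   step 1: {a, b, c, d} + {b, a, d, c} = {a+b, a+b, c+d, c+d}
    //   step 2: {a+b, a+b, c+d, c+d} + {c+d, c+d, a+b, a+b}
    //           = a+b+c+d in every lane, returned via __x[0]
    //
    // log2(_Np) permute/op pairs thus replace _Np - 1 sequential scalar
    // applications; this is the path behind stdx::reduce(__v, plus<>())
    // for a 4-element vector (assuming `stdx` aliases std::experimental).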
    // _S_sqrt
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }
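    // Example (a sketch; assumes AArch64, where __have_neon_a64 holds, and
    // `namespace stdx = std::experimental`): for a 4-element
    // native_simd<float> the dispatch above selects vsqrtq_f32, i.e. one
    // square-root instruction over the whole register:
    //
    //   stdx::native_simd<float> __v([](int __i) { return float(__i + 1); });
    //   auto __r = stdx::sqrt(__v); // {1, 1.41..., 1.73..., 2}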
    // _S_trunc: round toward zero.
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW
      _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            // No round-to-integral instruction available: truncate via a
            // float -> int32 -> float round-trip and keep the original
            // value where the round-trip could overflow.
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }
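    // Why the 0x1p23f threshold in the float fallback above is sound:
    // binary32 carries 24 significand bits, so any |x| >= 2^23 has a unit
    // ulp or larger and is already integral, needing no truncation; any
    // |x| < 2^23 fits in int32, so the float -> int32 -> float round-trip
    // truncates it exactly. For example, 3.7f becomes (int32)3, i.e. 3.0f,
    // while 1e30f keeps its original (already integral) value.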
    // _S_round: round to nearest, ties away from zero.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }
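    // vrnda/vrndaq round to nearest with ties away from zero, which is
    // exactly the semantic std::round requires, so rounding costs one
    // instruction per register. For example, applying stdx::round
    // (assuming `stdx` aliases std::experimental) to a float vector
    // {-2.5, -0.5, 0.5, 2.5} yields {-3, -1, 1, 3}.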
    // _S_floor
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }
    // _S_ceil
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }
  }; // _SimdImplNeon
// _MaskImplNeonMixin: mask-to-bits conversion using NEON horizontal adds.
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              // Select bit __i (modulo 8) in lane __i, then accumulate the
              // selected bits with pairwise/horizontal adds.
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(
                        __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                       vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                                 __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                       vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint),
                                                  __hi64(__asint)),
                                         __zero),
                                __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                       vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)),
                                 __zero),
                       __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};
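// Worked example of the bit-selection technique above (illustration
// only): for an 8-lane 16-bit mask, __bitsel is {1, 2, 4, 8, 16, 32, 64,
// 128}. Each mask lane is either 0 or all-ones, so `__asint &= __bitsel`
// leaves lane __i holding 0 or 1 << __i, and the horizontal add
// (vaddvq_s16 on AArch64, a vpadd ladder otherwise) sums these distinct
// powers of two into the final bit mask:
//
//   lanes {T, F, T, T, F, F, F, F}
//   -> after &= : {1, 0, 4, 8, 0, 0, 0, 0}
//   -> sum = 13 = 0b1101 (bits 0, 2 and 3 set)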
// _MaskImplNeon: simd_mask operations specialized for NEON.
template <typename _Abi, typename>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;
    // _S_all_of: inactive lanes are forced to all-ones first, so a fully
    // true mask is all-ones in every byte.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }
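    // Why `__x[0] + __x[1] == -2` is a complete test (a reasoning note):
    // every byte of __kk is either 0x00 or 0xFF, and a 64-bit word built
    // from such bytes equals -1 exactly when all its bytes are 0xFF. The
    // only way two such words can sum to -2 is (-1) + (-1), because the
    // carry chain forces every byte of both words to 0xFF; one addition
    // thus checks all 16 bytes at once.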
    // _S_any_of
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }
    // _S_none_of: inactive lanes are zeroed, so the mask must compare
    // equal to all-zeros.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }
    // _S_some_of: true iff the mask is neither all-false nor all-true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                                | ~__vector_bitcast<char>(
                                    _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            // Unsigned wrap-around: all-zeros yields 1 and all-ones yields
            // 0, both failing `> 1`; any mixed mask passes.
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }
    // _S_popcount: every true lane is -1 when viewed as a signed integer,
    // so the negated horizontal sum is the number of true lanes.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }
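    // Illustration of the negated-sum trick above: a true mask lane reads
    // as -1 when bitcast to a signed integer, so summing all lanes gives
    // -(number of true lanes). For an 8-lane 16-bit mask
    // {T, T, F, T, F, F, F, F}: the two halves add to {-1, -1, 0, -1},
    // two vpadd_s16 steps fold that to -3 in lane 0, and negation
    // returns 3.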
    // _S_find_first_set: not specialized for NEON; forwards to the
    // generic implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      { return _Base::_S_find_first_set(__k); }
    // _S_find_last_set: likewise forwards to the generic implementation.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      { return _Base::_S_find_last_set(__k); }
  };
_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_