// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2023 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi, typename>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

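    // NEON registers are 128 bits wide, so 16 bytes is the largest
    // single store this backend ever needs to emit.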
    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
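    // NEON has no masked load/store instructions, so inactive lanes are
    // handled with a per-element loop.  Illustrative user-level trigger
    // (a sketch, not part of this header; __mem is a made-up pointer):
    //   namespace stdx = std::experimental;
    //   stdx::native_simd<float> __v = 0.f;
    //   where(__v == 0.f, __v).copy_from(__mem, stdx::element_aligned);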
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
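    // Reduction strategy: a full 16-byte vector is first split into two
    // 8-byte halves that are combined with __binary_op; an 8-byte vector
    // is then folded with log2(N) element permutes.  E.g. for _Np == 4
    // and plus:
    //   [a b c d] + [b a d c] -> [a+b a+b c+d c+d]
    //   [a+b a+b c+d c+d] + [c+d c+d a+b a+b] -> the sum in every lane,
    // and lane 0 is returned.  User-level entry point (illustrative):
    //   float __sum = std::experimental::reduce(__v); // __v: native_simd<float>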
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        constexpr size_t _Np = __x.size();
        if constexpr (sizeof(__x) == 16 && _Np >= 4
                        && !_Abi::template _S_is_partial<_Tp>)
          {
            const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
            const auto __y = __binary_op(__halves[0], __halves[1]);
            return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
              __y, static_cast<_BinaryOperation&&>(__binary_op));
          }
        else if constexpr (_Np == 8)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                       __x._M_data)));
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                       __x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 4)
          {
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<1, 0, 3, 2>(__x._M_data)));
            __x
              = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                   __vector_permute<3, 2, 1, 0>(__x._M_data)));
            return __x[0];
          }
        else if constexpr (_Np == 2)
          {
            __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                     __vector_permute<1, 0>(__x._M_data)));
            return __x[0];
          }
        else
          return _Base::_S_reduce(__x,
                                  static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
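    // vsqrt/vsqrtq are A64-only instructions; on A32 the generic
    // (scalarizing) _Base implementation is used instead.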
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW
      _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
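            // Without vrnd, truncate via a float -> int32 -> float
            // round-trip.  Lanes with |x| >= 2^23 are already integral
            // (and could overflow the int32 conversion), so the original
            // value is kept for those lanes.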
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
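    // vrnda rounds to nearest with ties away from zero, matching the
    // semantics std::round requires.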
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

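  // Collapse a vector mask (every lane all-ones or all-zeros) into an
  // _Np-bit integer: AND lane __i with (1 << __i), then sum all lanes
  // horizontally (vaddv on AArch64, chains of vpadd otherwise).  E.g.
  // four 32-bit lanes [-1, 0, -1, 0] & [1, 2, 4, 8] = [1, 0, 4, 0],
  // whose horizontal sum is 0b0101.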
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(
                        __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                       vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                                 __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                       vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                         __zero),
                                __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                       vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)),
                                 __zero),
                       __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};

// }}}
// _MaskImplNeon {{{
template <typename _Abi, typename>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
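    // ORing in ~implicit_mask forces the padding lanes beyond the mask's
    // size to all-ones, so only the active lanes decide the result.  For
    // a full 16-byte mask every byte is 0x00 or 0xff, and the sum of the
    // two 64-bit halves reaches -2 only when both halves are all-ones.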
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
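    // With padding lanes forced to all-ones, a mask with some but not
    // all lanes set maps to an integer that is neither 0 nor all-ones;
    // the unsigned overflow trick `__kk + 1 > 1' tests exactly that
    // (0 wraps to 1, all-ones wraps to 0).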
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                              | ~__vector_bitcast<char>(
                                  _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
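    // Every lane of a mask is 0 or -1, so the number of set lanes is the
    // negated horizontal sum.  E.g. four int lanes [-1, 0, -1, -1] sum
    // to -3, giving a popcount of 3.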
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80