// libstdc++
// simd_detail.h
// Internal macros for the simd implementation -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_

// Everything in this header is only provided for C++17 and later; for older
// language modes the header defines nothing but its include guard.
#if __cplusplus >= 201703L

#include <cstddef>
#include <cstdint>

/// @cond undocumented

// Namespace helpers.
// _GLIBCXX_SIMD_BEGIN_NAMESPACE opens, in this order: namespace std (with
// default visibility), the versioned namespace, namespace experimental and the
// inline namespace parallelism_v2. _GLIBCXX_SIMD_END_NAMESPACE closes the same
// scopes in reverse order.
// The expansions use _GLIBCXX_VISIBILITY, _GLIBCXX_BEGIN_NAMESPACE_VERSION and
// _GLIBCXX_END_NAMESPACE_VERSION, none of which is defined in this file; they
// must be available wherever these two macros are expanded.
#define _GLIBCXX_SIMD_BEGIN_NAMESPACE \
  namespace std _GLIBCXX_VISIBILITY(default) \
  { \
    _GLIBCXX_BEGIN_NAMESPACE_VERSION \
    namespace experimental { \
    inline namespace parallelism_v2 {
#define _GLIBCXX_SIMD_END_NAMESPACE \
    } \
    } \
    _GLIBCXX_END_NAMESPACE_VERSION \
  }

// ISA extension detection. The following defines all the
// _GLIBCXX_SIMD_HAVE_XXX macros. Every one of them is always defined, to
// either 1 or 0, so it can be tested with a plain #if (no `defined` needed).
// ARM{{{
// NEON: set when the compiler predefines __ARM_NEON.
#if defined __ARM_NEON
#define _GLIBCXX_SIMD_HAVE_NEON 1
#else
#define _GLIBCXX_SIMD_HAVE_NEON 0
#endif
// NEON_A32: NEON together with either __ARM_ARCH >= 8 or __aarch64__.
#if defined __ARM_NEON && (__ARM_ARCH >= 8 || defined __aarch64__)
#define _GLIBCXX_SIMD_HAVE_NEON_A32 1
#else
#define _GLIBCXX_SIMD_HAVE_NEON_A32 0
#endif
// NEON_A64: NEON together with __aarch64__.
#if defined __ARM_NEON && defined __aarch64__
#define _GLIBCXX_SIMD_HAVE_NEON_A64 1
#else
#define _GLIBCXX_SIMD_HAVE_NEON_A64 0
#endif
//}}}
// x86{{{
// Each macro below mirrors the compiler's predefined feature macro of the
// corresponding name. Exceptions: SSE and SSE2 are additionally forced to 1
// whenever __x86_64__ is defined, and BMI1 is keyed on __BMI__.
#ifdef __MMX__
#define _GLIBCXX_SIMD_HAVE_MMX 1
#else
#define _GLIBCXX_SIMD_HAVE_MMX 0
#endif
#if defined __SSE__ || defined __x86_64__
#define _GLIBCXX_SIMD_HAVE_SSE 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE 0
#endif
#if defined __SSE2__ || defined __x86_64__
#define _GLIBCXX_SIMD_HAVE_SSE2 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE2 0
#endif
#ifdef __SSE3__
#define _GLIBCXX_SIMD_HAVE_SSE3 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE3 0
#endif
#ifdef __SSSE3__
#define _GLIBCXX_SIMD_HAVE_SSSE3 1
#else
#define _GLIBCXX_SIMD_HAVE_SSSE3 0
#endif
#ifdef __SSE4_1__
#define _GLIBCXX_SIMD_HAVE_SSE4_1 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE4_1 0
#endif
#ifdef __SSE4_2__
#define _GLIBCXX_SIMD_HAVE_SSE4_2 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE4_2 0
#endif
#ifdef __XOP__
#define _GLIBCXX_SIMD_HAVE_XOP 1
#else
#define _GLIBCXX_SIMD_HAVE_XOP 0
#endif
#ifdef __AVX__
#define _GLIBCXX_SIMD_HAVE_AVX 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX 0
#endif
#ifdef __AVX2__
#define _GLIBCXX_SIMD_HAVE_AVX2 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX2 0
#endif
#ifdef __BMI__
#define _GLIBCXX_SIMD_HAVE_BMI1 1
#else
#define _GLIBCXX_SIMD_HAVE_BMI1 0
#endif
#ifdef __BMI2__
#define _GLIBCXX_SIMD_HAVE_BMI2 1
#else
#define _GLIBCXX_SIMD_HAVE_BMI2 0
#endif
#ifdef __LZCNT__
#define _GLIBCXX_SIMD_HAVE_LZCNT 1
#else
#define _GLIBCXX_SIMD_HAVE_LZCNT 0
#endif
#ifdef __SSE4A__
#define _GLIBCXX_SIMD_HAVE_SSE4A 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE4A 0
#endif
#ifdef __FMA__
#define _GLIBCXX_SIMD_HAVE_FMA 1
#else
#define _GLIBCXX_SIMD_HAVE_FMA 0
#endif
#ifdef __FMA4__
#define _GLIBCXX_SIMD_HAVE_FMA4 1
#else
#define _GLIBCXX_SIMD_HAVE_FMA4 0
#endif
#ifdef __F16C__
#define _GLIBCXX_SIMD_HAVE_F16C 1
#else
#define _GLIBCXX_SIMD_HAVE_F16C 0
#endif
#ifdef __POPCNT__
#define _GLIBCXX_SIMD_HAVE_POPCNT 1
#else
#define _GLIBCXX_SIMD_HAVE_POPCNT 0
#endif
#ifdef __AVX512F__
#define _GLIBCXX_SIMD_HAVE_AVX512F 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512F 0
#endif
#ifdef __AVX512DQ__
#define _GLIBCXX_SIMD_HAVE_AVX512DQ 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512DQ 0
#endif
#ifdef __AVX512VL__
#define _GLIBCXX_SIMD_HAVE_AVX512VL 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512VL 0
#endif
#ifdef __AVX512BW__
#define _GLIBCXX_SIMD_HAVE_AVX512BW 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512BW 0
#endif

// ABI availability, derived from the ISA flags above:
//   *_SSE_ABI    follows SSE,     FULL_SSE_ABI    follows SSE2,
//   *_AVX_ABI    follows AVX,     FULL_AVX_ABI    follows AVX2,
//   *_AVX512_ABI follows AVX512F, FULL_AVX512_ABI follows AVX512BW.
#if _GLIBCXX_SIMD_HAVE_SSE
#define _GLIBCXX_SIMD_HAVE_SSE_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_SSE_ABI 0
#endif
#if _GLIBCXX_SIMD_HAVE_SSE2
#define _GLIBCXX_SIMD_HAVE_FULL_SSE_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_FULL_SSE_ABI 0
#endif

#if _GLIBCXX_SIMD_HAVE_AVX
#define _GLIBCXX_SIMD_HAVE_AVX_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX_ABI 0
#endif
#if _GLIBCXX_SIMD_HAVE_AVX2
#define _GLIBCXX_SIMD_HAVE_FULL_AVX_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_FULL_AVX_ABI 0
#endif

#if _GLIBCXX_SIMD_HAVE_AVX512F
#define _GLIBCXX_SIMD_HAVE_AVX512_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_AVX512_ABI 0
#endif
#if _GLIBCXX_SIMD_HAVE_AVX512BW
#define _GLIBCXX_SIMD_HAVE_FULL_AVX512_ABI 1
#else
#define _GLIBCXX_SIMD_HAVE_FULL_AVX512_ABI 0
#endif

// Consistency check. As _GLIBCXX_SIMD_HAVE_SSE2 is defined above, it is 1
// whenever __x86_64__ is defined, so this cannot fire as written; it only
// guards against a future change to that definition.
#if defined __x86_64__ && !_GLIBCXX_SIMD_HAVE_SSE2
#error "Use of SSE2 is required on AMD64"
#endif
//}}}

// Function attribute helpers.
// For Clang, _GLIBCXX_SIMD_NORMAL_MATH and _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
// expand to nothing. Otherwise NORMAL_MATH requests the optimize options
// "finite-math-only,no-signed-zeros" and ALWAYS_INLINE_LAMBDA is the
// always_inline attribute in __attribute__ syntax.
#ifdef __clang__
#define _GLIBCXX_SIMD_NORMAL_MATH
#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
#else
#define _GLIBCXX_SIMD_NORMAL_MATH \
  [[__gnu__::__optimize__("finite-math-only,no-signed-zeros")]]
#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))
#endif
#define _GLIBCXX_SIMD_NEVER_INLINE [[__gnu__::__noinline__]]
// _GLIBCXX_SIMD_INTRINSIC is _GLIBCXX_SIMD_ALWAYS_INLINE plus the artificial
// attribute.
#define _GLIBCXX_SIMD_INTRINSIC \
  [[__gnu__::__always_inline__, __gnu__::__artificial__]] inline
#define _GLIBCXX_SIMD_ALWAYS_INLINE [[__gnu__::__always_inline__]] inline
// Branch hints: forward __x to __builtin_expect with an expected value of 0
// (unlikely) or 1 (likely). __x is passed through without extra parentheses.
#define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
#define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)

// constexpr helpers.
// With __STRICT_ANSI__ or Clang, _GLIBCXX_SIMD_CONSTEXPR is empty and
// _GLIBCXX_SIMD_USE_CONSTEXPR_API is `const`; otherwise both are `constexpr`.
#if __STRICT_ANSI__ || defined __clang__
#define _GLIBCXX_SIMD_CONSTEXPR
#define _GLIBCXX_SIMD_USE_CONSTEXPR_API const
#else
#define _GLIBCXX_SIMD_CONSTEXPR constexpr
#define _GLIBCXX_SIMD_USE_CONSTEXPR_API constexpr
#endif

// _GLIBCXX_SIMD_USE_CONSTEXPR depends on the compiler only: `const` for Clang,
// `constexpr` otherwise.
#if defined __clang__
#define _GLIBCXX_SIMD_USE_CONSTEXPR const
#else
#define _GLIBCXX_SIMD_USE_CONSTEXPR constexpr
#endif

// Operator lists. Each _GLIBCXX_SIMD_LIST_* macro invokes __macro once per
// operator token of its group, passing the bare operator as the argument.
#define _GLIBCXX_SIMD_LIST_BINARY(__macro) __macro(|) __macro(&) __macro(^)
#define _GLIBCXX_SIMD_LIST_SHIFTS(__macro) __macro(<<) __macro(>>)
#define _GLIBCXX_SIMD_LIST_ARITHMETICS(__macro) \
  __macro(+) __macro(-) __macro(*) __macro(/) __macro(%)

// The _GLIBCXX_SIMD_ALL_* variants append `static_assert(true)` without a
// semicolon, so the use site has to supply the terminating `;` itself.
#define _GLIBCXX_SIMD_ALL_BINARY(__macro) \
  _GLIBCXX_SIMD_LIST_BINARY(__macro) static_assert(true)
#define _GLIBCXX_SIMD_ALL_SHIFTS(__macro) \
  _GLIBCXX_SIMD_LIST_SHIFTS(__macro) static_assert(true)
#define _GLIBCXX_SIMD_ALL_ARITHMETICS(__macro) \
  _GLIBCXX_SIMD_LIST_ARITHMETICS(__macro) static_assert(true)

// Opt-out switch: when _GLIBCXX_SIMD_NO_ALWAYS_INLINE is defined, the forced
// inlining macros are redefined to plain `inline` (or to nothing for the
// lambda variant). _GLIBCXX_SIMD_NEVER_INLINE is not affected.
#ifdef _GLIBCXX_SIMD_NO_ALWAYS_INLINE
#undef _GLIBCXX_SIMD_ALWAYS_INLINE
#define _GLIBCXX_SIMD_ALWAYS_INLINE inline
#undef _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
#undef _GLIBCXX_SIMD_INTRINSIC
#define _GLIBCXX_SIMD_INTRINSIC inline
#endif

// _GLIBCXX_SIMD_X86INTRIN is 1 if either SSE or MMX was detected above,
// 0 otherwise.
#if _GLIBCXX_SIMD_HAVE_SSE || _GLIBCXX_SIMD_HAVE_MMX
#define _GLIBCXX_SIMD_X86INTRIN 1
#else
#define _GLIBCXX_SIMD_X86INTRIN 0
#endif

// workaround macros {{{
// Note: unlike the _GLIBCXX_SIMD_HAVE_* macros, the conditionally defined
// workaround macros below are left undefined (not defined to 0) when their
// condition does not hold.

// use aliasing loads to help GCC understand the data accesses better
// This also seems to hide a miscompilation on swap(x[i], x[i + 1]) with
// fixed_size_simd<float, 16> x.
#define _GLIBCXX_SIMD_USE_ALIASING_LOADS 1

// vector conversions on x86 not optimized:
#if _GLIBCXX_SIMD_X86INTRIN
#define _GLIBCXX_SIMD_WORKAROUND_PR85048 1
#endif

// integer division not optimized
#ifndef __clang__
#define _GLIBCXX_SIMD_WORKAROUND_PR90993 1
#endif

// very bad codegen for extraction and concatenation of 128/256 "subregisters"
// with sizeof(element type) < 8: https://godbolt.org/g/mqUsgM
#if _GLIBCXX_SIMD_X86INTRIN
#define _GLIBCXX_SIMD_WORKAROUND_XXX_1 1
#endif

// bad codegen for 8 Byte memcpy to __vector_type_t<char, 16>
#define _GLIBCXX_SIMD_WORKAROUND_PR90424 1

// bad codegen for zero-extend using simple concat(__x, 0)
#if _GLIBCXX_SIMD_X86INTRIN
#define _GLIBCXX_SIMD_WORKAROUND_XXX_3 1
#endif

// https://github.com/cplusplus/parallelism-ts/issues/65 (incorrect return type
// of static_simd_cast)
#define _GLIBCXX_SIMD_FIX_P2TS_ISSUE65 1

// https://github.com/cplusplus/parallelism-ts/issues/66 (incorrect SFINAE
// constraint on (static)_simd_cast)
#define _GLIBCXX_SIMD_FIX_P2TS_ISSUE66 1
// }}}

/// @endcond

#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_

// vim: foldmethod=marker