25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
28#if __cplusplus >= 201703L
// Single-vector conversion routine (presumably the one-argument overload of
// __convert_x86: it calls itself recursively under that name).
// Converts the vector __v, whose element type _Tp and element count _Np are
// taken from _Traits, to the vector type _To (element type _Up, element
// count _M). Every decision below is made at compile time with
// `if constexpr`, selecting x86 intrinsics according to the __have_* flags.
// NOTE(review): this listing appears to have lost lines (e.g. the declarator
// and the braces); the comments describe only what is visible here.
32template <
typename _To,
typename _V,
typename _Traits>
33 _GLIBCXX_SIMD_INTRINSIC _To
36 static_assert(__is_vector_type_v<_V>);
37 using _Tp =
typename _Traits::value_type;
38 constexpr size_t _Np = _Traits::_S_full_size;
39 [[maybe_unused]]
const auto __intrin = __to_intrin(__v);
40 using _Up =
typename _VectorTraits<_To>::value_type;
41 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
// Size classes of source (__v) and destination (_To):
// x = at most 16 bytes, y = exactly 32 bytes, z = exactly 64 bytes.
44 [[maybe_unused]]
constexpr bool __x_to_x
45 =
sizeof(__v) <= 16 &&
sizeof(_To) <= 16;
46 [[maybe_unused]]
constexpr bool __x_to_y
47 =
sizeof(__v) <= 16 &&
sizeof(_To) == 32;
48 [[maybe_unused]]
constexpr bool __x_to_z
49 =
sizeof(__v) <= 16 &&
sizeof(_To) == 64;
50 [[maybe_unused]]
constexpr bool __y_to_x
51 =
sizeof(__v) == 32 &&
sizeof(_To) <= 16;
52 [[maybe_unused]]
constexpr bool __y_to_y
53 =
sizeof(__v) == 32 &&
sizeof(_To) == 32;
54 [[maybe_unused]]
constexpr bool __y_to_z
55 =
sizeof(__v) == 32 &&
sizeof(_To) == 64;
56 [[maybe_unused]]
constexpr bool __z_to_x
57 =
sizeof(__v) == 64 &&
sizeof(_To) <= 16;
58 [[maybe_unused]]
constexpr bool __z_to_y
59 =
sizeof(__v) == 64 &&
sizeof(_To) == 32;
60 [[maybe_unused]]
constexpr bool __z_to_z
61 =
sizeof(__v) == 64 &&
sizeof(_To) == 64;
// Element-type classes. __iA_to_iB: both element types are integral, the
// source element has A/8 bytes and the destination element has B/8 bytes.
64 [[maybe_unused]]
constexpr bool __i_to_i
65 = is_integral_v<_Up> && is_integral_v<_Tp>;
66 [[maybe_unused]]
constexpr bool __i8_to_i16
67 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 2;
68 [[maybe_unused]]
constexpr bool __i8_to_i32
69 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 4;
70 [[maybe_unused]]
constexpr bool __i8_to_i64
71 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 8;
72 [[maybe_unused]]
constexpr bool __i16_to_i8
73 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 1;
74 [[maybe_unused]]
constexpr bool __i16_to_i32
75 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 4;
76 [[maybe_unused]]
constexpr bool __i16_to_i64
77 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 8;
78 [[maybe_unused]]
constexpr bool __i32_to_i8
79 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 1;
80 [[maybe_unused]]
constexpr bool __i32_to_i16
81 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 2;
82 [[maybe_unused]]
constexpr bool __i32_to_i64
83 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 8;
84 [[maybe_unused]]
constexpr bool __i64_to_i8
85 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 1;
86 [[maybe_unused]]
constexpr bool __i64_to_i16
87 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 2;
88 [[maybe_unused]]
constexpr bool __i64_to_i32
89 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4;
// Integer -> floating point. __sNN / __uNN: signed / unsigned integral
// source element of NN/8 bytes; f32 / f64: floating-point destination
// element of 4 / 8 bytes.
93 [[maybe_unused]]
constexpr bool __s64_to_f32
94 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 8
95 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
96 [[maybe_unused]]
constexpr bool __s32_to_f32
97 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
98 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
99 [[maybe_unused]]
constexpr bool __s16_to_f32
100 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
101 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
102 [[maybe_unused]]
constexpr bool __s8_to_f32
103 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
104 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
105 [[maybe_unused]]
constexpr bool __u64_to_f32
106 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 8
107 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
108 [[maybe_unused]]
constexpr bool __u32_to_f32
109 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
110 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
111 [[maybe_unused]]
constexpr bool __u16_to_f32
112 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
113 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
114 [[maybe_unused]]
constexpr bool __u8_to_f32
115 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
116 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
117 [[maybe_unused]]
constexpr bool __s64_to_f64
118 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 8
119 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
120 [[maybe_unused]]
constexpr bool __s32_to_f64
121 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
122 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
123 [[maybe_unused]]
constexpr bool __u64_to_f64
124 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 8
125 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
126 [[maybe_unused]]
constexpr bool __u32_to_f64
127 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
128 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
// Floating point -> integer: same naming, with the tests on _Tp and _Up
// swapped (floating-point source, integral destination).
129 [[maybe_unused]]
constexpr bool __f32_to_s64
130 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
131 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
132 [[maybe_unused]]
constexpr bool __f32_to_s32
133 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
134 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
135 [[maybe_unused]]
constexpr bool __f32_to_u64
136 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
137 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
138 [[maybe_unused]]
constexpr bool __f32_to_u32
139 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
140 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
141 [[maybe_unused]]
constexpr bool __f64_to_s64
142 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
143 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
144 [[maybe_unused]]
constexpr bool __f64_to_s32
145 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
146 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
147 [[maybe_unused]]
constexpr bool __f64_to_u64
148 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
149 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
150 [[maybe_unused]]
constexpr bool __f64_to_u32
151 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
152 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
// "ibw": an integral element type of at most 2 bytes (signedness is not
// tested by these four flags).
153 [[maybe_unused]]
constexpr bool __ibw_to_f32
154 = is_integral_v<_Tp> &&
sizeof(_Tp) <= 2
155 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
156 [[maybe_unused]]
constexpr bool __ibw_to_f64
157 = is_integral_v<_Tp> &&
sizeof(_Tp) <= 2
158 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
159 [[maybe_unused]]
constexpr bool __f32_to_ibw
160 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
161 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
162 [[maybe_unused]]
constexpr bool __f64_to_ibw
163 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
164 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
// Floating point -> floating point of the other size (4 <-> 8 bytes).
165 [[maybe_unused]]
constexpr bool __f32_to_f64
166 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4
167 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
168 [[maybe_unused]]
constexpr bool __f64_to_f32
169 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8
170 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
// Without AVX2, integer conversions involving a 32-byte vector are split
// into 16-byte pieces: a 32-byte source is handed as two halves to the
// two-argument __convert_x86; a 32-byte destination is concatenated from
// two separately converted halves.
172 if constexpr (__i_to_i && __y_to_x && !__have_avx2)
173 return __convert_x86<_To>(__lo128(__v), __hi128(__v));
174 else if constexpr (__i_to_i && __x_to_y && !__have_avx2)
175 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v),
176 __convert_x86<__vector_type_t<_Up, _M / 2>>(
177 __extract_part<1, _Np / _M * 2>(__v)));
// All remaining integer -> integer conversions. The static_asserts state
// the ISA requirements of this section.
178 else if constexpr (__i_to_i)
180 static_assert(__x_to_x || __have_avx2,
181 "integral conversions with ymm registers require AVX2");
182 static_assert(__have_avx512bw
183 || ((
sizeof(_Tp) >= 4 ||
sizeof(__v) < 64)
184 && (
sizeof(_Up) >= 4 ||
sizeof(_To) < 64)),
185 "8/16-bit integers in zmm registers require AVX512BW");
// NOTE(review): the condition below tests __have_avx512f for 64-byte
// vectors, while the message talks about ymm/AVX2 -- the text looks
// copy-pasted from the first assertion; confirm the intended wording.
186 static_assert((
sizeof(__v) < 64 &&
sizeof(_To) < 64) || __have_avx512f,
187 "integral conversions with ymm registers require AVX2");
// Same element size and same floating-point-ness: only a bitcast is
// needed (via __zero_extend when _Np >= _M does not hold).
189 if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> &&
190 sizeof(_Tp) ==
sizeof(_Up))
193 if constexpr (_Np >= _M)
194 return __intrin_bitcast<_To>(__v);
196 return __zero_extend(__vector_bitcast<_Up>(__v));
// Fewer source than destination elements and a destination above 16 bytes:
// convert into a smaller vector first, then __zero_extend the result.
198 else if constexpr (_Np < _M &&
sizeof(_To) > 16)
200 return __zero_extend(
201 __convert_x86<__vector_type_t<
202 _Up, (16 /
sizeof(_Up) > _Np) ? 16 /
sizeof(_Up) : _Np>>(__v));
// More source than destination elements and a source above 16 bytes:
// convert only part 0 extracted from __v.
203 else if constexpr (_Np > _M &&
sizeof(__v) > 16)
205 return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v));
// 64-bit -> 32-bit integers.
206 else if constexpr (__i64_to_i32)
208 if constexpr (__x_to_x && __have_avx512vl)
209 return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin));
210 else if constexpr (__x_to_x)
211 return __auto_bitcast(
212 _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8));
213 else if constexpr (__y_to_x && __have_avx512vl)
214 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin));
215 else if constexpr (__y_to_x && __have_avx512f)
216 return __intrin_bitcast<_To>(
217 __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v))));
218 else if constexpr (__y_to_x)
219 return __intrin_bitcast<_To>(
220 __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8),
222 else if constexpr (__z_to_y)
223 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin));
// 64-bit -> 16-bit integers.
225 else if constexpr (__i64_to_i16)
227 if constexpr (__x_to_x && __have_avx512vl)
228 return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin));
229 else if constexpr (__x_to_x && __have_avx512f)
230 return __intrin_bitcast<_To>(
231 __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
232 else if constexpr (__x_to_x && __have_ssse3)
234 return __intrin_bitcast<_To>(
235 _mm_shuffle_epi8(__intrin,
236 _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
237 -0x80, -0x80, -0x80, -0x80, -0x80,
238 -0x80, -0x80, -0x80, -0x80)));
241 else if constexpr (__y_to_x && __have_avx512vl)
242 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin));
243 else if constexpr (__y_to_x && __have_avx512f)
244 return __intrin_bitcast<_To>(
245 __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
246 else if constexpr (__y_to_x)
248 const auto __a = _mm256_shuffle_epi8(
250 _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
251 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
252 -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
253 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
255 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
257 else if constexpr (__z_to_x)
258 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin));
// 64-bit -> 8-bit integers; every visible branch uses an AVX-512
// _mm*_cvtepi64_epi8 intrinsic.
260 else if constexpr (__i64_to_i8)
262 if constexpr (__x_to_x && __have_avx512vl)
263 return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin));
264 else if constexpr (__x_to_x && __have_avx512f)
265 return __intrin_bitcast<_To>(
266 __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin))));
267 else if constexpr (__y_to_x && __have_avx512vl)
268 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin));
269 else if constexpr (__y_to_x && __have_avx512f)
270 return __intrin_bitcast<_To>(
271 _mm512_cvtepi64_epi8(__zero_extend(__intrin)));
272 else if constexpr (__z_to_x)
273 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin));
// 32-bit -> 64-bit integers; is_signed_v<_Tp> picks the cvtepi32 or the
// cvtepu32 form of the intrinsic.
275 else if constexpr (__i32_to_i64)
277 if constexpr (__have_sse4_1 && __x_to_x)
278 return __intrin_bitcast<_To>(is_signed_v<_Tp>
279 ? _mm_cvtepi32_epi64(__intrin)
280 : _mm_cvtepu32_epi64(__intrin));
281 else if constexpr (__x_to_x)
283 return __intrin_bitcast<_To>(
284 _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp>
285 ? _mm_srai_epi32(__intrin, 31)
288 else if constexpr (__x_to_y)
289 return __intrin_bitcast<_To>(is_signed_v<_Tp>
290 ? _mm256_cvtepi32_epi64(__intrin)
291 : _mm256_cvtepu32_epi64(__intrin));
292 else if constexpr (__y_to_z)
293 return __intrin_bitcast<_To>(is_signed_v<_Tp>
294 ? _mm512_cvtepi32_epi64(__intrin)
295 : _mm512_cvtepu32_epi64(__intrin));
// 32-bit -> 16-bit integers.
297 else if constexpr (__i32_to_i16)
299 if constexpr (__x_to_x && __have_avx512vl)
300 return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin));
301 else if constexpr (__x_to_x && __have_avx512f)
302 return __intrin_bitcast<_To>(
303 __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
304 else if constexpr (__x_to_x && __have_ssse3)
305 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
306 __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
307 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
// Fallback without the shuffle_epi8 path: a sequence of 16-bit unpacks.
308 else if constexpr (__x_to_x)
310 auto __a = _mm_unpacklo_epi16(__intrin, __m128i());
311 auto __b = _mm_unpackhi_epi16(__intrin, __m128i());
312 auto __c = _mm_unpacklo_epi16(__a, __b);
313 auto __d = _mm_unpackhi_epi16(__a, __b);
314 return __intrin_bitcast<_To>(
315 _mm_unpacklo_epi16(__c, __d));
317 else if constexpr (__y_to_x && __have_avx512vl)
318 return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin));
319 else if constexpr (__y_to_x && __have_avx512f)
320 return __intrin_bitcast<_To>(
321 __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
322 else if constexpr (__y_to_x)
324 auto __a = _mm256_shuffle_epi8(
326 _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
327 -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
328 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
329 -0x80, -0x80, -0x80));
330 return __intrin_bitcast<_To>(__lo128(
331 _mm256_permute4x64_epi64(__a,
334 else if constexpr (__z_to_y)
335 return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin));
// 32-bit -> 8-bit integers.
337 else if constexpr (__i32_to_i8)
339 if constexpr (__x_to_x && __have_avx512vl)
340 return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin));
341 else if constexpr (__x_to_x && __have_avx512f)
342 return __intrin_bitcast<_To>(
343 __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin))));
344 else if constexpr (__x_to_x && __have_ssse3)
346 return __intrin_bitcast<_To>(
347 _mm_shuffle_epi8(__intrin,
348 _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
349 -0x80, -0x80, -0x80, -0x80, -0x80,
350 -0x80, -0x80, -0x80, -0x80)));
// Fallback built from 8-bit unpacks; the final `&` with
// _mm_cvtsi32_si128(-1) keeps only the lowest 32 bits of __e.
352 else if constexpr (__x_to_x)
355 = _mm_unpacklo_epi8(__intrin, __intrin);
357 = _mm_unpackhi_epi8(__intrin, __intrin);
358 const auto __c = _mm_unpacklo_epi8(__a, __b);
359 const auto __d = _mm_unpackhi_epi8(__a, __b);
360 const auto __e = _mm_unpacklo_epi8(__c, __d);
361 return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
363 else if constexpr (__y_to_x && __have_avx512vl)
364 return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin));
365 else if constexpr (__y_to_x && __have_avx512f)
366 return __intrin_bitcast<_To>(
367 _mm512_cvtepi32_epi8(__zero_extend(__intrin)));
368 else if constexpr (__z_to_x)
369 return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin));
// 16-bit -> 64-bit integers.
371 else if constexpr (__i16_to_i64)
373 if constexpr (__x_to_x && __have_sse4_1)
374 return __intrin_bitcast<_To>(is_signed_v<_Tp>
375 ? _mm_cvtepi16_epi64(__intrin)
376 : _mm_cvtepu16_epi64(__intrin));
// Signed fallback: __x holds the result of an arithmetic shift by 15 and
// is interleaved with the input to form the upper bits.
377 else if constexpr (__x_to_x && is_signed_v<_Tp>)
379 auto __x = _mm_srai_epi16(__intrin, 15);
380 auto __y = _mm_unpacklo_epi16(__intrin, __x);
381 __x = _mm_unpacklo_epi16(__x, __x);
382 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x));
384 else if constexpr (__x_to_x)
385 return __intrin_bitcast<_To>(
386 _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()),
388 else if constexpr (__x_to_y)
389 return __intrin_bitcast<_To>(is_signed_v<_Tp>
390 ? _mm256_cvtepi16_epi64(__intrin)
391 : _mm256_cvtepu16_epi64(__intrin));
392 else if constexpr (__x_to_z)
393 return __intrin_bitcast<_To>(is_signed_v<_Tp>
394 ? _mm512_cvtepi16_epi64(__intrin)
395 : _mm512_cvtepu16_epi64(__intrin));
// 16-bit -> 32-bit integers.
397 else if constexpr (__i16_to_i32)
399 if constexpr (__x_to_x && __have_sse4_1)
400 return __intrin_bitcast<_To>(is_signed_v<_Tp>
401 ? _mm_cvtepi16_epi32(__intrin)
402 : _mm_cvtepu16_epi32(__intrin));
403 else if constexpr (__x_to_x && is_signed_v<_Tp>)
404 return __intrin_bitcast<_To>(
405 _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16));
406 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
407 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i()));
408 else if constexpr (__x_to_y)
409 return __intrin_bitcast<_To>(is_signed_v<_Tp>
410 ? _mm256_cvtepi16_epi32(__intrin)
411 : _mm256_cvtepu16_epi32(__intrin));
412 else if constexpr (__y_to_z)
413 return __intrin_bitcast<_To>(is_signed_v<_Tp>
414 ? _mm512_cvtepi16_epi32(__intrin)
415 : _mm512_cvtepu16_epi32(__intrin));
// 16-bit -> 8-bit integers.
417 else if constexpr (__i16_to_i8)
419 if constexpr (__x_to_x && __have_avx512bw_vl)
420 return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin));
421 else if constexpr (__x_to_x && __have_avx512bw)
422 return __intrin_bitcast<_To>(
423 __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
424 else if constexpr (__x_to_x && __have_ssse3)
425 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
426 __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
427 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
428 else if constexpr (__x_to_x)
431 = _mm_unpacklo_epi8(__intrin, __intrin);
433 = _mm_unpackhi_epi8(__intrin, __intrin);
434 auto __c = _mm_unpacklo_epi8(__a, __b);
435 auto __d = _mm_unpackhi_epi8(__a, __b);
436 auto __e = _mm_unpacklo_epi8(__c, __d);
437 auto __f = _mm_unpackhi_epi8(__c, __d);
438 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
440 else if constexpr (__y_to_x && __have_avx512bw_vl)
441 return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin));
442 else if constexpr (__y_to_x && __have_avx512bw)
443 return __intrin_bitcast<_To>(
444 __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
445 else if constexpr (__y_to_x)
447 auto __a = _mm256_shuffle_epi8(
449 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
450 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
451 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
452 4, 6, 8, 10, 12, 14));
453 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
455 else if constexpr (__z_to_y && __have_avx512bw)
456 return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin));
// 64-byte source without AVX512BW is asserted unreachable.
457 else if constexpr (__z_to_y)
458 __assert_unreachable<_Tp>();
// 8-bit -> 64-bit integers.
460 else if constexpr (__i8_to_i64)
462 if constexpr (__x_to_x && __have_sse4_1)
463 return __intrin_bitcast<_To>(is_signed_v<_Tp>
464 ? _mm_cvtepi8_epi64(__intrin)
465 : _mm_cvtepu8_epi64(__intrin));
466 else if constexpr (__x_to_x && is_signed_v<_Tp>)
468 if constexpr (__have_ssse3)
470 auto __dup = _mm_unpacklo_epi8(__intrin, __intrin);
471 auto __epi16 = _mm_srai_epi16(__dup, 8);
472 _mm_shuffle_epi8(__epi16,
473 _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
478 auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
479 __x = _mm_unpacklo_epi16(__x, __x);
480 return __intrin_bitcast<_To>(
481 _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24),
482 _mm_srai_epi32(__x, 31)));
485 else if constexpr (__x_to_x)
487 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(
488 _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
492 else if constexpr (__x_to_y)
493 return __intrin_bitcast<_To>(is_signed_v<_Tp>
494 ? _mm256_cvtepi8_epi64(__intrin)
495 : _mm256_cvtepu8_epi64(__intrin));
496 else if constexpr (__x_to_z)
497 return __intrin_bitcast<_To>(is_signed_v<_Tp>
498 ? _mm512_cvtepi8_epi64(__intrin)
499 : _mm512_cvtepu8_epi64(__intrin));
// 8-bit -> 32-bit integers.
501 else if constexpr (__i8_to_i32)
503 if constexpr (__x_to_x && __have_sse4_1)
504 return __intrin_bitcast<_To>(is_signed_v<_Tp>
505 ? _mm_cvtepi8_epi32(__intrin)
506 : _mm_cvtepu8_epi32(__intrin));
507 else if constexpr (__x_to_x && is_signed_v<_Tp>)
509 const auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
510 return __intrin_bitcast<_To>(
511 _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24));
513 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
514 return __intrin_bitcast<_To>(
515 _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
517 else if constexpr (__x_to_y)
518 return __intrin_bitcast<_To>(is_signed_v<_Tp>
519 ? _mm256_cvtepi8_epi32(__intrin)
520 : _mm256_cvtepu8_epi32(__intrin));
521 else if constexpr (__x_to_z)
522 return __intrin_bitcast<_To>(is_signed_v<_Tp>
523 ? _mm512_cvtepi8_epi32(__intrin)
524 : _mm512_cvtepu8_epi32(__intrin));
// 8-bit -> 16-bit integers.
526 else if constexpr (__i8_to_i16)
528 if constexpr (__x_to_x && __have_sse4_1)
529 return __intrin_bitcast<_To>(is_signed_v<_Tp>
530 ? _mm_cvtepi8_epi16(__intrin)
531 : _mm_cvtepu8_epi16(__intrin));
532 else if constexpr (__x_to_x && is_signed_v<_Tp>)
533 return __intrin_bitcast<_To>(
534 _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8));
535 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
536 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i()));
537 else if constexpr (__x_to_y)
538 return __intrin_bitcast<_To>(is_signed_v<_Tp>
539 ? _mm256_cvtepi8_epi16(__intrin)
540 : _mm256_cvtepu8_epi16(__intrin));
541 else if constexpr (__y_to_z && __have_avx512bw)
542 return __intrin_bitcast<_To>(is_signed_v<_Tp>
543 ? _mm512_cvtepi8_epi16(__intrin)
544 : _mm512_cvtepu8_epi16(__intrin));
// 64-byte destination without AVX512BW is asserted unreachable.
545 else if constexpr (__y_to_z)
546 __assert_unreachable<_Tp>();
// float -> signed 64-bit integers; the branches visible here all require
// AVX512DQ and use the truncating cvttps intrinsics.
548 else if constexpr (__f32_to_s64)
550 if constexpr (__have_avx512dq_vl && __x_to_x)
551 return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin));
552 else if constexpr (__have_avx512dq_vl && __x_to_y)
553 return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin));
554 else if constexpr (__have_avx512dq && __y_to_z)
555 return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin));
// float -> unsigned 64-bit integers; same structure with the epu64 forms.
558 else if constexpr (__f32_to_u64)
560 if constexpr (__have_avx512dq_vl && __x_to_x)
561 return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin));
562 else if constexpr (__have_avx512dq_vl && __x_to_y)
563 return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin));
564 else if constexpr (__have_avx512dq && __y_to_z)
565 return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin));
// float -> signed 32-bit integers.
// NOTE(review): the body of the equal-size branch is not visible in this
// listing.
568 else if constexpr (__f32_to_s32)
570 if constexpr (__x_to_x || __y_to_y || __z_to_z)
575 __assert_unreachable<_Tp>();
// float -> unsigned 32-bit integers. Without AVX512VL the 512-bit
// intrinsic is applied to a widened vector and the low part is returned.
577 else if constexpr (__f32_to_u32)
579 if constexpr (__have_avx512vl && __x_to_x)
580 return __auto_bitcast(_mm_cvttps_epu32(__intrin));
581 else if constexpr (__have_avx512f && __x_to_x)
582 return __auto_bitcast(
583 __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v))));
584 else if constexpr (__have_avx512vl && __y_to_y)
585 return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin));
586 else if constexpr (__have_avx512f && __y_to_y)
587 return __vector_bitcast<_Up>(
588 __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v))));
589 else if constexpr (__x_to_x || __y_to_y || __z_to_z)
596 __assert_unreachable<_Tp>();
// float -> 8/16-bit integers: convert to a vector of int with _Np elements
// first, then convert that int vector to _To.
598 else if constexpr (__f32_to_ibw)
599 return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v));
// double -> signed 64-bit integers (AVX512DQ branches only are visible).
600 else if constexpr (__f64_to_s64)
602 if constexpr (__have_avx512dq_vl && __x_to_x)
603 return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin));
604 else if constexpr (__have_avx512dq_vl && __y_to_y)
605 return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin));
606 else if constexpr (__have_avx512dq && __z_to_z)
607 return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin));
// double -> unsigned 64-bit integers (AVX512DQ branches only are visible).
610 else if constexpr (__f64_to_u64)
612 if constexpr (__have_avx512dq_vl && __x_to_x)
613 return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin));
614 else if constexpr (__have_avx512dq_vl && __y_to_y)
615 return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin));
616 else if constexpr (__have_avx512dq && __z_to_z)
617 return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin));
// double -> signed 32-bit integers.
620 else if constexpr (__f64_to_s32)
622 if constexpr (__x_to_x)
623 return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin));
624 else if constexpr (__y_to_x)
625 return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin));
626 else if constexpr (__z_to_y)
627 return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin));
// double -> unsigned 32-bit integers. The non-AVX512 branches use the
// signed conversion on floor(x) - 0x8000'0000.
// NOTE(review): the remainder of those expressions is not visible in this
// listing.
629 else if constexpr (__f64_to_u32)
631 if constexpr (__have_avx512vl && __x_to_x)
632 return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin));
633 else if constexpr (__have_sse4_1 && __x_to_x)
634 return __vector_bitcast<_Up, _M>(
635 _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u))
637 else if constexpr (__x_to_x)
642 else if constexpr (__have_avx512vl && __y_to_x)
643 return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin));
644 else if constexpr (__y_to_x)
646 return __intrin_bitcast<_To>(
647 __vector_bitcast<_Up>(
648 _mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u))
651 else if constexpr (__z_to_y)
652 return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin));
// double -> 8/16-bit integers: go through a vector of int with at least
// 4 elements, then convert that int vector to _To.
654 else if constexpr (__f64_to_ibw)
656 return __convert_x86<_To>(
657 __convert_x86<__vector_type_t<
int, (_Np < 4 ? 4 : _Np)>>(__v));
// signed 64-bit integers -> float. The last branch (64-byte source without
// AVX512DQ) converts to a vector of 8 doubles first.
659 else if constexpr (__s64_to_f32)
661 if constexpr (__x_to_x && __have_avx512dq_vl)
662 return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin));
663 else if constexpr (__y_to_x && __have_avx512dq_vl)
664 return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin));
665 else if constexpr (__z_to_y && __have_avx512dq)
666 return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin));
667 else if constexpr (__z_to_y)
668 return __intrin_bitcast<_To>(
669 _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v)));
// unsigned 64-bit integers -> float. The last branch adds two unsigned
// 32-bit conversions, one of the value shifted right by 32 and one of the
// truncated low 32 bits.
// NOTE(review): any scaling of the high part is not visible in this listing.
671 else if constexpr (__u64_to_f32)
673 if constexpr (__x_to_x && __have_avx512dq_vl)
674 return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin));
675 else if constexpr (__y_to_x && __have_avx512dq_vl)
676 return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin));
677 else if constexpr (__z_to_y && __have_avx512dq)
678 return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin));
679 else if constexpr (__z_to_y)
681 return __intrin_bitcast<_To>(
682 __lo256(_mm512_cvtepu32_ps(__auto_bitcast(
683 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32)))))
685 + __lo256(_mm512_cvtepu32_ps(
686 __auto_bitcast(_mm512_cvtepi64_epi32(__intrin)))));
// signed 32-bit integers -> float.
// NOTE(review): the body of this branch is not visible in this listing.
689 else if constexpr (__s32_to_f32)
// unsigned 32-bit integers -> float.
693 else if constexpr (__u32_to_f32)
695 if constexpr (__x_to_x && __have_avx512vl)
699 else if constexpr (__x_to_x && __have_avx512f)
700 return __intrin_bitcast<_To>(
701 __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
// With FMA/FMA4: convert the upper 16 bits (__v >> 16) and the lower
// 16 bits (__v & 0xffff) separately with the signed conversion and
// recombine them as 0x10000 * high + low.
702 else if constexpr (__x_to_x && (__have_fma || __have_fma4))
704 return __auto_bitcast(0x10000
705 * _mm_cvtepi32_ps(__to_intrin(__v >> 16))
706 + _mm_cvtepi32_ps(__to_intrin(__v & 0xffff)));
707 else if constexpr (__y_to_y && __have_avx512vl)
711 else if constexpr (__y_to_y && __have_avx512f)
712 return __intrin_bitcast<_To>(
713 __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
// 32-byte fallback: the same high/low 16-bit split as above.
714 else if constexpr (__y_to_y)
716 return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16))
717 + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff));
// 8/16-bit integers -> float: go through a vector of int with _M elements
// when _M <= 4 or AVX2 is available.
720 else if constexpr (__ibw_to_f32)
722 if constexpr (_M <= 4 || __have_avx2)
723 return __convert_x86<_To>(
724 __convert_x86<__vector_type_t<int, _M>>(__v));
// Otherwise (asserted: source <= 16 bytes, destination 32 bytes) the input
// is widened into two int vectors __a and __b, which are then passed to the
// two-argument __convert_x86.
727 static_assert(__x_to_y);
729 if constexpr (__have_sse4_1)
731 __a =
sizeof(_Tp) == 2
732 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin)
733 : _mm_cvtepu16_epi32(__intrin))
734 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin)
735 : _mm_cvtepu8_epi32(__intrin));
737 = _mm_shuffle_epi32(__intrin,
sizeof(_Tp) == 2 ? 0xee : 0xe9);
738 __b =
sizeof(_Tp) == 2
739 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
740 : _mm_cvtepu16_epi32(__w))
741 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
742 : _mm_cvtepu8_epi32(__w));
747 if constexpr (
sizeof(_Tp) == 1)
749 __tmp = is_signed_v<_Tp>
750 ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin,
753 : _mm_unpacklo_epi8(__intrin, __m128i());
757 static_assert(
sizeof(_Tp) == 2);
760 __a = is_signed_v<_Tp>
761 ? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
762 : _mm_unpacklo_epi16(__tmp, __m128i());
763 __b = is_signed_v<_Tp>
764 ? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
765 : _mm_unpackhi_epi16(__tmp, __m128i());
767 return __convert_x86<_To>(__vector_bitcast<int>(__a),
768 __vector_bitcast<int>(__b));
// signed 64-bit integers -> double. The last branch (no AVX512DQ) adds a
// signed conversion of __v >> 32 to an unsigned conversion of the truncated
// low 32 bits.
// NOTE(review): any scaling of the high part is not visible in this listing.
771 else if constexpr (__s64_to_f64)
773 if constexpr (__x_to_x && __have_avx512dq_vl)
774 return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin));
775 else if constexpr (__y_to_y && __have_avx512dq_vl)
776 return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin));
777 else if constexpr (__z_to_z && __have_avx512dq)
778 return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin));
779 else if constexpr (__z_to_z)
781 return __intrin_bitcast<_To>(
782 _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
784 + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
// unsigned 64-bit integers -> double; same structure, with unsigned
// conversions for both halves in the last branch.
787 else if constexpr (__u64_to_f64)
789 if constexpr (__x_to_x && __have_avx512dq_vl)
790 return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin));
791 else if constexpr (__y_to_y && __have_avx512dq_vl)
792 return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin));
793 else if constexpr (__z_to_z && __have_avx512dq)
794 return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin));
795 else if constexpr (__z_to_z)
797 return __intrin_bitcast<_To>(
798 _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
800 + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
// signed 32-bit integers -> double.
803 else if constexpr (__s32_to_f64)
805 if constexpr (__x_to_x)
806 return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin));
807 else if constexpr (__x_to_y)
808 return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin));
809 else if constexpr (__y_to_z)
810 return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin));
// unsigned 32-bit integers -> double.
812 else if constexpr (__u32_to_f64)
814 if constexpr (__x_to_x && __have_avx512vl)
815 return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin));
816 else if constexpr (__x_to_x && __have_avx512f)
817 return __intrin_bitcast<_To>(
818 __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
// Fallback without AVX-512: flip the top bit (xor 0x8000'0000), convert
// with the signed intrinsic, then add 0x8000'0000 to the result.
819 else if constexpr (__x_to_x)
820 return __intrin_bitcast<_To>(
821 _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
822 else if constexpr (__x_to_y && __have_avx512vl)
823 return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin));
824 else if constexpr (__x_to_y && __have_avx512f)
825 return __intrin_bitcast<_To>(
826 __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
// Same top-bit trick for the 32-byte destination.
827 else if constexpr (__x_to_y)
828 return __intrin_bitcast<_To>(
829 _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
830 else if constexpr (__y_to_z)
831 return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin));
// 8/16-bit integers -> double: go through a vector of int with
// max(4, _M) elements.
833 else if constexpr (__ibw_to_f64)
835 return __convert_x86<_To>(
836 __convert_x86<__vector_type_t<
int,
std::max(
size_t(4), _M)>>(__v));
// float -> double.
838 else if constexpr (__f32_to_f64)
840 if constexpr (__x_to_x)
841 return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin));
842 else if constexpr (__x_to_y)
843 return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin));
844 else if constexpr (__y_to_z)
845 return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin));
// double -> float.
847 else if constexpr (__f64_to_f32)
849 if constexpr (__x_to_x)
850 return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin));
851 else if constexpr (__y_to_x)
852 return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin));
853 else if constexpr (__z_to_y)
854 return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin));
// Visible tail: an unreachable assertion, followed by the generic
// __vector_convert over an index sequence of length min(_M, _Np).
857 __assert_unreachable<_Tp>();
860 return __vector_convert<_To>(__v, make_index_sequence<
std::min(_M, _Np)>());
// Two-input overload of __convert_x86: converts the elements of __v0 and __v1
// (both of vector type _V, element type _Tp) into one vector of type _To
// (element type _Up). The static_assert below requires 2 * _Np <= _M, so the
// destination must have room for the elements of both inputs. The
// implementation is chosen at compile time with `if constexpr`, based on the
// operand sizes, the element types and the available ISA extensions
// (__have_* flags); the last return statement uses __vector_convert.
866template <
typename _To,
typename _V,
typename _Traits>
867 _GLIBCXX_SIMD_INTRINSIC _To
868 __convert_x86(_V __v0, _V __v1)
870 static_assert(__is_vector_type_v<_V>);
871 using _Tp =
typename _Traits::value_type;
872 constexpr size_t _Np = _Traits::_S_full_size;
// Results of __to_intrin on the inputs; these are what gets passed to the
// _mm*/_mm256*/_mm512* intrinsics further down.
873 [[maybe_unused]]
const auto __i0 = __to_intrin(__v0);
874 [[maybe_unused]]
const auto __i1 = __to_intrin(__v1);
875 using _Up =
typename _VectorTraits<_To>::value_type;
876 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
878 static_assert(2 * _Np <= _M,
879 "__v1 would be discarded; use the one-argument "
880 "__convert_x86 overload instead");
// Size categories, named <source>_to_<destination>:
// x = at most 16 bytes, y = exactly 32 bytes, z = exactly 64 bytes.
883 [[maybe_unused]]
constexpr bool __x_to_x
884 =
sizeof(__v0) <= 16 &&
sizeof(_To) <= 16;
885 [[maybe_unused]]
constexpr bool __x_to_y
886 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 32;
887 [[maybe_unused]]
constexpr bool __x_to_z
888 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 64;
889 [[maybe_unused]]
constexpr bool __y_to_x
890 =
sizeof(__v0) == 32 &&
sizeof(_To) <= 16;
891 [[maybe_unused]]
constexpr bool __y_to_y
892 =
sizeof(__v0) == 32 &&
sizeof(_To) == 32;
893 [[maybe_unused]]
constexpr bool __y_to_z
894 =
sizeof(__v0) == 32 &&
sizeof(_To) == 64;
895 [[maybe_unused]]
constexpr bool __z_to_x
896 =
sizeof(__v0) == 64 &&
sizeof(_To) <= 16;
897 [[maybe_unused]]
constexpr bool __z_to_y
898 =
sizeof(__v0) == 64 &&
sizeof(_To) == 32;
899 [[maybe_unused]]
constexpr bool __z_to_z
900 =
sizeof(__v0) == 64 &&
sizeof(_To) == 64;
// Integer-to-integer conversion kinds, keyed on sizeof the source element
// (_Tp) and the destination element (_Up); signedness is not considered.
903 [[maybe_unused]]
constexpr bool __i_to_i
904 = is_integral_v<_Up> && is_integral_v<_Tp>;
905 [[maybe_unused]]
constexpr bool __i8_to_i16
906 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 2;
907 [[maybe_unused]]
constexpr bool __i8_to_i32
908 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 4;
909 [[maybe_unused]]
constexpr bool __i8_to_i64
910 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 8;
911 [[maybe_unused]]
constexpr bool __i16_to_i8
912 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 1;
913 [[maybe_unused]]
constexpr bool __i16_to_i32
914 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 4;
915 [[maybe_unused]]
constexpr bool __i16_to_i64
916 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 8;
917 [[maybe_unused]]
constexpr bool __i32_to_i8
918 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 1;
919 [[maybe_unused]]
constexpr bool __i32_to_i16
920 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 2;
921 [[maybe_unused]]
constexpr bool __i32_to_i64
922 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 8;
923 [[maybe_unused]]
constexpr bool __i64_to_i8
924 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 1;
925 [[maybe_unused]]
constexpr bool __i64_to_i16
926 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 2;
927 [[maybe_unused]]
constexpr bool __i64_to_i32
928 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4;
// Integer-to-floating-point conversion kinds. The s/u prefix encodes the
// signedness of the integral source; __i64_to_f32 accepts either signedness.
932 [[maybe_unused]]
constexpr bool __i64_to_f32
933 = is_integral_v<_Tp> &&
sizeof(_Tp) == 8
934 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
935 [[maybe_unused]]
constexpr bool __s32_to_f32
936 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
937 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
938 [[maybe_unused]]
constexpr bool __s16_to_f32
939 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
940 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
941 [[maybe_unused]]
constexpr bool __s8_to_f32
942 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
943 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
944 [[maybe_unused]]
constexpr bool __u32_to_f32
945 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
946 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
947 [[maybe_unused]]
constexpr bool __u16_to_f32
948 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
949 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
950 [[maybe_unused]]
constexpr bool __u8_to_f32
951 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
952 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
953 [[maybe_unused]]
constexpr bool __s64_to_f64
954 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 8
955 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
956 [[maybe_unused]]
constexpr bool __s32_to_f64
957 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
958 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
959 [[maybe_unused]]
constexpr bool __s16_to_f64
960 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
961 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
962 [[maybe_unused]]
constexpr bool __s8_to_f64
963 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
964 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
965 [[maybe_unused]]
constexpr bool __u64_to_f64
966 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 8
967 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
968 [[maybe_unused]]
constexpr bool __u32_to_f64
969 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
970 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
971 [[maybe_unused]]
constexpr bool __u16_to_f64
972 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
973 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
974 [[maybe_unused]]
constexpr bool __u8_to_f64
975 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
976 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
// Floating-point-to-integer conversion kinds. Here the s/u prefix encodes the
// signedness of the integral destination; "ibw" means a destination integer
// of at most 2 bytes, regardless of signedness.
977 [[maybe_unused]]
constexpr bool __f32_to_s64
978 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
979 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
980 [[maybe_unused]]
constexpr bool __f32_to_s32
981 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
982 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
983 [[maybe_unused]]
constexpr bool __f32_to_u64
984 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
985 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
986 [[maybe_unused]]
constexpr bool __f32_to_u32
987 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
988 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
989 [[maybe_unused]]
constexpr bool __f64_to_s64
990 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
991 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
992 [[maybe_unused]]
constexpr bool __f64_to_s32
993 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
994 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
995 [[maybe_unused]]
constexpr bool __f64_to_u64
996 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
997 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
998 [[maybe_unused]]
constexpr bool __f64_to_u32
999 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
1000 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1001 [[maybe_unused]]
constexpr bool __f32_to_ibw
1002 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
1003 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
1004 [[maybe_unused]]
constexpr bool __f64_to_ibw
1005 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
1006 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
// Conversions between floating-point types of different size.
1007 [[maybe_unused]]
constexpr bool __f32_to_f64
1008 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4
1009 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1010 [[maybe_unused]]
constexpr bool __f64_to_f32
1011 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8
1012 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
// Compile-time dispatch starts here. Without AVX2, 32-byte integer inputs are
// split with __lo128/__hi128 and forwarded to an overload taking more inputs.
1014 if constexpr (__i_to_i && __y_to_x && !__have_avx2)
1016 return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
// Remaining integer-to-integer cases: verify the required ISA extensions.
1018 else if constexpr (__i_to_i)
1020 static_assert(__x_to_x || __have_avx2,
1021 "integral conversions with ymm registers require AVX2");
1022 static_assert(__have_avx512bw
1023 || ((
sizeof(_Tp) >= 4 ||
sizeof(__v0) < 64)
1024 && (
sizeof(_Up) >= 4 ||
sizeof(_To) < 64)),
1025 "8/16-bit integers in zmm registers require AVX512BW");
// NOTE(review): the condition below is about 64-byte operands and
// __have_avx512f, yet the message talks about ymm registers and AVX2. This
// looks like a copy of the first assertion's message -- confirm and correct.
1026 static_assert((
sizeof(__v0) < 64 &&
sizeof(_To) < 64) || __have_avx512f,
1027 "integral conversions with ymm registers require AVX2");
// Presumably guards the return that follows: when the concatenation of both
// inputs is usable on the target, defer to the one-argument overload.
1030 if constexpr (
sizeof(__v0) < 16 || (
sizeof(__v0) == 16 && __have_avx2)
1031 || (
sizeof(__v0) == 16 && __have_avx
1032 && is_floating_point_v<_Tp>)
1033 || (
sizeof(__v0) == 32 && __have_avx512f
1034 && (
sizeof(_Tp) >= 4 || __have_avx512bw)))
1038 return __convert_x86<_To>(__concat(__v0, __v1));
1045 !(is_floating_point_v<
1046 _Tp> == is_floating_point_v<_Up> &&
sizeof(_Tp) ==
sizeof(_Up)));
// Destination larger than 16 bytes with more elements than the two inputs
// provide: produce a smaller vector (at least 16 bytes wide, see Min) and
// widen it with __zero_extend.
1048 if constexpr (2 * _Np < _M &&
sizeof(_To) > 16)
1050 constexpr size_t Min = 16 /
sizeof(_Up);
1051 return __zero_extend(
1053 __vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0,
// 8-byte integers -> 4-byte integers.
1056 else if constexpr (__i64_to_i32)
1058 if constexpr (__x_to_x)
1059 return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0),
1060 __auto_bitcast(__v1), 0x88));
1061 else if constexpr (__y_to_y)
1064 return __auto_bitcast(
1065 __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
1066 __auto_bitcast(__v1), 0x88)));
1074 else if constexpr (__z_to_z)
1075 return __intrin_bitcast<_To>(
1076 __concat(_mm512_cvtepi64_epi32(__i0),
1077 _mm512_cvtepi64_epi32(__i1)));
// 8-byte integers -> 2-byte integers.
1079 else if constexpr (__i64_to_i16)
1081 if constexpr (__x_to_x)
1084 if constexpr (__have_sse4_1)
1086 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1087 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1088 _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
1089 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
1093 return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1094 _Up(__v1[0]), _Up(__v1[1])};
1097 else if constexpr (__y_to_x)
1100 = _mm256_unpacklo_epi16(__i0, __i1);
1102 = _mm256_unpackhi_epi16(__i0, __i1);
1104 = _mm256_unpacklo_epi16(__a, __b);
1105 return __intrin_bitcast<_To>(
1106 _mm_unpacklo_epi32(__lo128(__c), __hi128(__c)));
1108 else if constexpr (__z_to_y)
1109 return __intrin_bitcast<_To>(
1110 __concat(_mm512_cvtepi64_epi16(__i0),
1111 _mm512_cvtepi64_epi16(__i1)));
// 8-byte integers -> 1-byte integers.
1113 else if constexpr (__i64_to_i8)
1115 if constexpr (__x_to_x && __have_sse4_1)
1117 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1118 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1119 _mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
1120 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1123 else if constexpr (__x_to_x && __have_ssse3)
1125 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(
1127 __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1128 -0x80, -0x80, -0x80, -0x80, -0x80,
1129 -0x80, -0x80, -0x80, -0x80)),
1131 __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1132 -0x80, -0x80, -0x80, -0x80, -0x80,
1133 -0x80, -0x80, -0x80, -0x80))));
1135 else if constexpr (__x_to_x)
1137 return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1138 _Up(__v1[0]), _Up(__v1[1])};
1140 else if constexpr (__y_to_x)
1142 const auto __a = _mm256_shuffle_epi8(
1143 _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA),
1144 _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
1145 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1146 -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
1147 -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
1148 -0x80, -0x80, -0x80, -0x80));
1149 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
// 4-byte integers -> 2-byte integers.
1152 else if constexpr (__i32_to_i16)
1154 if constexpr (__x_to_x)
1157 if constexpr (__have_sse4_1)
1159 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1160 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa),
1161 _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
1164 else if constexpr (__have_ssse3)
1166 return __intrin_bitcast<_To>(
1167 _mm_hadd_epi16(__to_intrin(__v0 << 16),
1168 __to_intrin(__v1 << 16)));
1179 auto __a = _mm_unpacklo_epi16(__i0, __i1);
1180 auto __b = _mm_unpackhi_epi16(__i0, __i1);
1181 auto __c = _mm_unpacklo_epi16(__a, __b);
1182 auto __d = _mm_unpackhi_epi16(__a, __b);
1183 return __intrin_bitcast<_To>(
1184 _mm_unpacklo_epi16(__c, __d));
1187 else if constexpr (__y_to_y)
1190 = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1191 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1192 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1193 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
1194 auto __a = _mm256_shuffle_epi8(__i0, __shuf);
1195 auto __b = _mm256_shuffle_epi8(__i1, __shuf);
1196 return __intrin_bitcast<_To>(
1197 __xzyw(_mm256_unpacklo_epi64(__a, __b)));
// 4-byte integers -> 1-byte integers.
1200 else if constexpr (__i32_to_i8)
1202 if constexpr (__x_to_x && __have_ssse3)
1205 = _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
1206 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1208 return __intrin_bitcast<_To>(
1209 _mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask),
1210 _mm_shuffle_epi8(__i1, shufmask)));
1212 else if constexpr (__x_to_x)
1214 auto __a = _mm_unpacklo_epi8(__i0, __i1);
1215 auto __b = _mm_unpackhi_epi8(__i0, __i1);
1216 auto __c = _mm_unpacklo_epi8(__a, __b);
1217 auto __d = _mm_unpackhi_epi8(__a, __b);
1218 auto __e = _mm_unpacklo_epi8(__c, __d);
1219 return __intrin_bitcast<_To>(__e & __m128i{-1, 0});
1221 else if constexpr (__y_to_x)
1223 const auto __a = _mm256_shuffle_epi8(
1224 _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA),
1225 _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
1226 6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
1227 -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
1228 -0x80, -0x80, -0x80, 2, 6, 10, 14));
1229 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
// 2-byte integers -> 1-byte integers.
1232 else if constexpr (__i16_to_i8)
1234 if constexpr (__x_to_x && __have_ssse3)
1236 const auto __shuf =
reinterpret_cast<__m128i
>(
1237 __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80,
1238 0x80, 0x80, 0x80, 0x80, 0x80,
1240 return __intrin_bitcast<_To>(
1241 _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf),
1242 _mm_shuffle_epi8(__i1, __shuf)));
1244 else if constexpr (__x_to_x)
1246 auto __a = _mm_unpacklo_epi8(__i0, __i1);
1247 auto __b = _mm_unpackhi_epi8(__i0, __i1);
1248 auto __c = _mm_unpacklo_epi8(__a, __b);
1249 auto __d = _mm_unpackhi_epi8(__a, __b);
1250 auto __e = _mm_unpacklo_epi8(__c, __d);
1251 auto __f = _mm_unpackhi_epi8(__c, __d);
1252 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
1254 else if constexpr (__y_to_y)
1256 return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8(
1257 (__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
1258 | _mm256_slli_epi16(__i1, 8),
1259 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
1260 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
1261 7, 9, 11, 13, 15))));
// 8-byte integers (signed or unsigned) -> 4-byte floating point.
1264 else if constexpr (__i64_to_f32)
1266 if constexpr (__x_to_x)
1267 return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]);
1268 else if constexpr (__y_to_y)
1270 static_assert(__y_to_y && __have_avx2);
1271 const auto __a = _mm256_unpacklo_epi32(__i0, __i1);
1272 const auto __b = _mm256_unpackhi_epi32(__i0, __i1);
1274 = _mm256_unpacklo_epi32(__a, __b);
1275 const auto __hi32 = __vector_bitcast<
1276 conditional_t<is_signed_v<_Tp>, int, _UInt>>(
1277 _mm256_unpackhi_epi32(__a, __b));
1280 * __convert_x86<__vector_type_t<float, 8>>(__hi32);
1282 = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16));
1284 = _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
1285 return __xzyw((__hi + __mid) + __lo);
1287 else if constexpr (__z_to_z && __have_avx512dq)
1289 return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0),
1290 _mm512_cvtepi64_ps(__i1))
1291 : __concat(_mm512_cvtepu64_ps(__i0),
1292 _mm512_cvtepu64_ps(__i1));
1294 else if constexpr (__z_to_z && is_signed_v<_Tp>)
1296 const __m512 __hi32 = _mm512_cvtepi32_ps(
1297 __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)),
1298 _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32))));
1299 const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0),
1300 _mm512_cvtepi64_epi32(__i1));
1306 = _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
1308 = _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
1309 return (__hi32 * 0x100000000LL + __hi16) + __lo16;
1311 else if constexpr (__z_to_z && is_unsigned_v<_Tp>)
1313 return __intrin_bitcast<_To>(
1314 _mm512_cvtepu32_ps(__concat(
1315 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)),
1316 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32))))
1318 + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0),
1319 _mm512_cvtepi64_epi32(__i1))));
// 8-byte floating point -> 4-byte integers (signed, then unsigned).
1322 else if constexpr (__f64_to_s32)
1326 else if constexpr (__f64_to_u32)
1328 if constexpr (__x_to_x && __have_sse4_1)
1330 return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64(
1331 _mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
1332 _mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
1337 else if constexpr (__y_to_y)
1339 return __vector_bitcast<_Up>(
1340 __concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
1342 _mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
// 8-byte floating point -> 1/2-byte integers: convert both inputs into one
// vector of int first, then narrow it with the one-argument overload.
1347 else if constexpr (__f64_to_ibw)
1354 return __convert_x86<_To>(
1355 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1));
// 4-byte floating point -> 1/2-byte integers: convert each input to int
// separately, then narrow the two int vectors with this overload.
1358 else if constexpr (__f32_to_ibw)
1360 return __convert_x86<_To>(
1361 __convert_x86<__vector_type_t<int, _Np>>(__v0),
1362 __convert_x86<__vector_type_t<int, _Np>>(__v1));
// Destinations of 32 bytes or more: convert each input into a vector with
// _M / 2 elements and concatenate the two results.
1366 if constexpr (
sizeof(_To) >= 32)
1368 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0),
1369 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1));
// 16-byte destinations: convert each input on its own, then combine the low
// parts of both results with the unpacklo variant matching the number of
// result bytes produced per input (sizeof(_Up) * _Np).
1370 else if constexpr (
sizeof(_To) == 16)
1372 const auto __lo = __to_intrin(__convert_x86<_To>(__v0));
1373 const auto __hi = __to_intrin(__convert_x86<_To>(__v1));
1374 if constexpr (
sizeof(_Up) * _Np == 8)
1376 if constexpr (is_floating_point_v<_Up>)
1377 return __auto_bitcast(
1378 _mm_unpacklo_pd(__vector_bitcast<double>(__lo),
1379 __vector_bitcast<double>(__hi)));
1381 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1383 else if constexpr (
sizeof(_Up) * _Np == 4)
1385 if constexpr (is_floating_point_v<_Up>)
1386 return __auto_bitcast(
1387 _mm_unpacklo_ps(__vector_bitcast<float>(__lo),
1388 __vector_bitcast<float>(__hi)));
1390 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
1392 else if constexpr (
sizeof(_Up) * _Np == 2)
1393 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi));
1395 __assert_unreachable<_Tp>();
// Last resort: conversion through __vector_convert over _Np indices.
1398 return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>());
// Four-input overload of __convert_x86: converts the elements of __v0..__v3
// (all of vector type _V, element type _Tp) into one vector of type _To
// (element type _Up). The static_assert below requires 4 * _Np <= _M, so the
// destination must have room for the elements of all four inputs. The
// implementation is chosen at compile time with `if constexpr`, based on the
// operand sizes, the element types and the available ISA extensions
// (__have_* flags); the last return statement uses __vector_convert.
1405template <
typename _To,
typename _V,
typename _Traits>
1406 _GLIBCXX_SIMD_INTRINSIC _To
1407 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3)
1409 static_assert(__is_vector_type_v<_V>);
1410 using _Tp =
typename _Traits::value_type;
1411 constexpr size_t _Np = _Traits::_S_full_size;
// Results of __to_intrin on the inputs; these are what gets passed to the
// _mm*/_mm256* intrinsics further down.
1412 [[maybe_unused]]
const auto __i0 = __to_intrin(__v0);
1413 [[maybe_unused]]
const auto __i1 = __to_intrin(__v1);
1414 [[maybe_unused]]
const auto __i2 = __to_intrin(__v2);
1415 [[maybe_unused]]
const auto __i3 = __to_intrin(__v3);
1416 using _Up =
typename _VectorTraits<_To>::value_type;
1417 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
1419 static_assert(4 * _Np <= _M,
1420 "__v2/__v3 would be discarded; use the two/one-argument "
1421 "__convert_x86 overload instead");
// Size categories, named <source>_to_<destination>:
// x = at most 16 bytes, y = exactly 32 bytes, z = exactly 64 bytes.
1424 [[maybe_unused]]
constexpr bool __x_to_x
1425 =
sizeof(__v0) <= 16 &&
sizeof(_To) <= 16;
1426 [[maybe_unused]]
constexpr bool __x_to_y
1427 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 32;
1428 [[maybe_unused]]
constexpr bool __x_to_z
1429 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 64;
1430 [[maybe_unused]]
constexpr bool __y_to_x
1431 =
sizeof(__v0) == 32 &&
sizeof(_To) <= 16;
1432 [[maybe_unused]]
constexpr bool __y_to_y
1433 =
sizeof(__v0) == 32 &&
sizeof(_To) == 32;
1434 [[maybe_unused]]
constexpr bool __y_to_z
1435 =
sizeof(__v0) == 32 &&
sizeof(_To) == 64;
1436 [[maybe_unused]]
constexpr bool __z_to_x
1437 =
sizeof(__v0) == 64 &&
sizeof(_To) <= 16;
1438 [[maybe_unused]]
constexpr bool __z_to_y
1439 =
sizeof(__v0) == 64 &&
sizeof(_To) == 32;
1440 [[maybe_unused]]
constexpr bool __z_to_z
1441 =
sizeof(__v0) == 64 &&
sizeof(_To) == 64;
// Integer-to-integer conversion kinds, keyed on sizeof the source element
// (_Tp) and the destination element (_Up); signedness is not considered.
1444 [[maybe_unused]]
constexpr bool __i_to_i
1445 = is_integral_v<_Up> && is_integral_v<_Tp>;
1446 [[maybe_unused]]
constexpr bool __i8_to_i16
1447 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 2;
1448 [[maybe_unused]]
constexpr bool __i8_to_i32
1449 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 4;
1450 [[maybe_unused]]
constexpr bool __i8_to_i64
1451 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 8;
1452 [[maybe_unused]]
constexpr bool __i16_to_i8
1453 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 1;
1454 [[maybe_unused]]
constexpr bool __i16_to_i32
1455 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 4;
1456 [[maybe_unused]]
constexpr bool __i16_to_i64
1457 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 8;
1458 [[maybe_unused]]
constexpr bool __i32_to_i8
1459 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 1;
1460 [[maybe_unused]]
constexpr bool __i32_to_i16
1461 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 2;
1462 [[maybe_unused]]
constexpr bool __i32_to_i64
1463 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 8;
1464 [[maybe_unused]]
constexpr bool __i64_to_i8
1465 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 1;
1466 [[maybe_unused]]
constexpr bool __i64_to_i16
1467 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 2;
1468 [[maybe_unused]]
constexpr bool __i64_to_i32
1469 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4;
// Integer-to-floating-point conversion kinds. The s/u prefix encodes the
// signedness of the integral source; __i64_to_f32 accepts either signedness.
1473 [[maybe_unused]]
constexpr bool __i64_to_f32
1474 = is_integral_v<_Tp> &&
sizeof(_Tp) == 8
1475 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1476 [[maybe_unused]]
constexpr bool __s32_to_f32
1477 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
1478 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1479 [[maybe_unused]]
constexpr bool __s16_to_f32
1480 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
1481 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1482 [[maybe_unused]]
constexpr bool __s8_to_f32
1483 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
1484 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1485 [[maybe_unused]]
constexpr bool __u32_to_f32
1486 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
1487 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1488 [[maybe_unused]]
constexpr bool __u16_to_f32
1489 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
1490 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1491 [[maybe_unused]]
constexpr bool __u8_to_f32
1492 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
1493 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1494 [[maybe_unused]]
constexpr bool __s64_to_f64
1495 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 8
1496 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1497 [[maybe_unused]]
constexpr bool __s32_to_f64
1498 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
1499 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1500 [[maybe_unused]]
constexpr bool __s16_to_f64
1501 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
1502 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1503 [[maybe_unused]]
constexpr bool __s8_to_f64
1504 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
1505 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1506 [[maybe_unused]]
constexpr bool __u64_to_f64
1507 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 8
1508 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1509 [[maybe_unused]]
constexpr bool __u32_to_f64
1510 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
1511 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1512 [[maybe_unused]]
constexpr bool __u16_to_f64
1513 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
1514 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1515 [[maybe_unused]]
constexpr bool __u8_to_f64
1516 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
1517 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
// Floating-point-to-integer conversion kinds. Here the s/u prefix encodes the
// signedness of the integral destination; "ibw" means a destination integer
// of at most 2 bytes, regardless of signedness.
1518 [[maybe_unused]]
constexpr bool __f32_to_s64
1519 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
1520 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
1521 [[maybe_unused]]
constexpr bool __f32_to_s32
1522 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
1523 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
1524 [[maybe_unused]]
constexpr bool __f32_to_u64
1525 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
1526 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
1527 [[maybe_unused]]
constexpr bool __f32_to_u32
1528 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
1529 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
1530 [[maybe_unused]]
constexpr bool __f64_to_s64
1531 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
1532 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1533 [[maybe_unused]]
constexpr bool __f64_to_s32
1534 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
1535 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1536 [[maybe_unused]]
constexpr bool __f64_to_u64
1537 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
1538 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1539 [[maybe_unused]]
constexpr bool __f64_to_u32
1540 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
1541 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1542 [[maybe_unused]]
constexpr bool __f32_to_ibw
1543 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
1544 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
1545 [[maybe_unused]]
constexpr bool __f64_to_ibw
1546 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
1547 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
// Conversions between floating-point types of different size.
1548 [[maybe_unused]]
constexpr bool __f32_to_f64
1549 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4
1550 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1551 [[maybe_unused]]
constexpr bool __f64_to_f32
1552 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8
1553 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
// Compile-time dispatch starts here. Without AVX2, 32-byte integer inputs are
// split with __lo128/__hi128 and forwarded to the eight-input overload.
1555 if constexpr (__i_to_i && __y_to_x && !__have_avx2)
1558 return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
1559 __hi128(__v1), __lo128(__v2), __hi128(__v2),
1560 __lo128(__v3), __hi128(__v3));
// Remaining integer-to-integer cases: verify the required ISA extensions.
1562 else if constexpr (__i_to_i)
1564 static_assert(__x_to_x || __have_avx2,
1565 "integral conversions with ymm registers require AVX2");
1566 static_assert(__have_avx512bw
1567 || ((
sizeof(_Tp) >= 4 ||
sizeof(__v0) < 64)
1568 && (
sizeof(_Up) >= 4 ||
sizeof(_To) < 64)),
1569 "8/16-bit integers in zmm registers require AVX512BW");
// NOTE(review): the condition below is about 64-byte operands and
// __have_avx512f, yet the message talks about ymm registers and AVX2. This
// looks like a copy of the first assertion's message -- confirm and correct.
1570 static_assert((
sizeof(__v0) < 64 &&
sizeof(_To) < 64) || __have_avx512f,
1571 "integral conversions with ymm registers require AVX2");
// Presumably guards the return that follows: when pairwise concatenation of
// the inputs is usable on the target, defer to the two-argument overload.
1574 if constexpr (
sizeof(__v0) < 16 || (
sizeof(__v0) == 16 && __have_avx2)
1575 || (
sizeof(__v0) == 16 && __have_avx
1576 && is_floating_point_v<_Tp>)
1577 || (
sizeof(__v0) == 32 && __have_avx512f))
1581 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3));
1588 !(is_floating_point_v<
1589 _Tp> == is_floating_point_v<_Up> &&
sizeof(_Tp) ==
sizeof(_Up)));
// Destination larger than 16 bytes with more elements than the four inputs
// provide: produce a smaller vector (at least 16 bytes wide, see Min) and
// widen it with __zero_extend.
1591 if constexpr (4 * _Np < _M &&
sizeof(_To) > 16)
1593 constexpr size_t Min = 16 /
sizeof(_Up);
1594 return __zero_extend(
1596 __vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>(
1597 __v0, __v1, __v2, __v3));
// 8-byte integers -> 2-byte integers.
1599 else if constexpr (__i64_to_i16)
1601 if constexpr (__x_to_x && __have_sse4_1)
1603 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1605 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22),
1606 _mm_blend_epi16(_mm_slli_si128(__i2, 4),
1607 _mm_slli_si128(__i3, 6), 0x88),
1609 _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
1612 else if constexpr (__y_to_y && __have_avx2)
1614 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
1615 __xzyw(_mm256_blend_epi16(
1617 _mm256_shuffle_ps(__vector_bitcast<float>(__v0),
1618 __vector_bitcast<float>(__v2),
1620 __to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps(
1621 __vector_bitcast<float>(__v1),
1622 __vector_bitcast<float>(__v3), 0x88))
1626 _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11,
1627 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7,
// 8-byte integers -> 1-byte integers.
1648 else if constexpr (__i64_to_i8)
1650 if constexpr (__x_to_x)
1654 else if constexpr (__y_to_x)
1657 = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24)
1658 | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16)
1659 | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8)
1660 | _mm256_slli_epi32(
1666 auto __b = _mm256_unpackhi_epi64(
1668 auto __c = _mm256_unpacklo_epi8(
1670 return __intrin_bitcast<_To>(
1671 _mm_unpacklo_epi16(__lo128(__c),
// 4-byte integers -> 1-byte integers.
1675 else if constexpr (__i32_to_i8)
1677 if constexpr (__x_to_x)
1679 if constexpr (__have_ssse3)
1681 const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff;
1682 const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff)
1684 const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff)
1686 const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24;
1687 return __intrin_bitcast<_To>(
1688 _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3),
1689 _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
1690 2, 6, 10, 14, 3, 7, 11,
1696 = _mm_unpacklo_epi8(__i0, __i2);
1698 = _mm_unpackhi_epi8(__i0, __i2);
1700 = _mm_unpacklo_epi8(__i1, __i3);
1702 = _mm_unpackhi_epi8(__i1, __i3);
1704 = _mm_unpacklo_epi8(__a, __c);
1706 = _mm_unpackhi_epi8(__a, __c);
1708 = _mm_unpacklo_epi8(__b, __d);
1710 = _mm_unpackhi_epi8(__b, __d);
1711 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(
1712 _mm_unpacklo_epi8(__e, __g),
1713 _mm_unpacklo_epi8(__f, __h)
1717 else if constexpr (__y_to_y)
1719 const auto __a = _mm256_shuffle_epi8(
1720 __to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16(
1721 __i0, _mm256_slli_epi32(__i1, 16), 0xAA))
1723 | (__vector_bitcast<_UShort>(_mm256_blend_epi16(
1724 __i2, _mm256_slli_epi32(__i3, 16), 0xAA))
1726 _mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7,
1727 11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9,
1729 return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32(
1730 __a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
// 8-byte integers (signed or unsigned) -> 4-byte floating point.
1733 else if constexpr (__i64_to_f32)
1737 if constexpr (__x_to_y)
1739 return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1],
1740 __v2[0], __v2[1], __v3[0],
1743 const auto __a = _mm_unpacklo_epi32(__i0, __i1);
1744 const auto __b = _mm_unpackhi_epi32(__i0, __i1);
1745 const auto __c = _mm_unpacklo_epi32(__i2, __i3);
1746 const auto __d = _mm_unpackhi_epi32(__i2, __i3);
1747 const auto __lo32a = _mm_unpacklo_epi32(__a, __b);
1748 const auto __lo32b = _mm_unpacklo_epi32(__c, __d);
1749 const auto __hi32 = __vector_bitcast<
1750 conditional_t<is_signed_v<_Tp>, int, _UInt>>(
1751 __concat(_mm_unpackhi_epi32(__a, __b),
1752 _mm_unpackhi_epi32(__c, __d)));
1755 * __convert_x86<__vector_type_t<float, 8>>(__hi32);
1758 * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16),
1759 _mm_srli_epi32(__lo32b, 16)));
1760 const auto __lo = _mm256_cvtepi32_ps(
1761 __concat(_mm_set1_epi32(0x0000ffffu) & __lo32a,
1762 _mm_set1_epi32(0x0000ffffu) & __lo32b));
1763 return (__hi + __mid) + __lo;
// 8-byte floating point -> 1/2-byte integers: convert the inputs pairwise to
// vectors of int first, then narrow them with the two-argument overload.
1766 else if constexpr (__f64_to_ibw)
1768 return __convert_x86<_To>(
1769 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
1770 __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3));
// 4-byte floating point -> 1/2-byte integers: convert each input to int
// separately, then narrow the four int vectors with this overload.
1772 else if constexpr (__f32_to_ibw)
1774 return __convert_x86<_To>(
1775 __convert_x86<__vector_type_t<int, _Np>>(__v0),
1776 __convert_x86<__vector_type_t<int, _Np>>(__v1),
1777 __convert_x86<__vector_type_t<int, _Np>>(__v2),
1778 __convert_x86<__vector_type_t<int, _Np>>(__v3));
// Destinations of 32 bytes or more: two conversions into vectors with _M / 2
// elements (argument lists starting at __v0 and __v2) are concatenated.
1782 if constexpr (
sizeof(_To) >= 32)
1784 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0,
1786 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2,
// 16-byte destinations: convert the inputs pairwise with the two-argument
// overload, then combine the low parts of both results with the unpacklo
// variant matching the bytes produced per pair (sizeof(_Up) * _Np * 2).
1788 else if constexpr (
sizeof(_To) == 16)
1790 const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1));
1791 const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3));
1792 if constexpr (
sizeof(_Up) * _Np * 2 == 8)
1794 if constexpr (is_floating_point_v<_Up>)
1795 return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi));
1797 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
1799 else if constexpr (
sizeof(_Up) * _Np * 2 == 4)
1801 if constexpr (is_floating_point_v<_Up>)
1802 return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi));
1804 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
1807 __assert_unreachable<_Tp>();
// Last resort: conversion through __vector_convert over _Np indices.
1810 return __vector_convert<_To>(__v0, __v1, __v2, __v3,
1811 make_index_sequence<_Np>());
1818template <
typename _To,
typename _V,
typename _Traits>
1819 _GLIBCXX_SIMD_INTRINSIC _To
1820 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
1823 static_assert(__is_vector_type_v<_V>);
1824 using _Tp =
typename _Traits::value_type;
1825 constexpr size_t _Np = _Traits::_S_full_size;
1826 [[maybe_unused]]
const auto __i0 = __to_intrin(__v0);
1827 [[maybe_unused]]
const auto __i1 = __to_intrin(__v1);
1828 [[maybe_unused]]
const auto __i2 = __to_intrin(__v2);
1829 [[maybe_unused]]
const auto __i3 = __to_intrin(__v3);
1830 [[maybe_unused]]
const auto __i4 = __to_intrin(__v4);
1831 [[maybe_unused]]
const auto __i5 = __to_intrin(__v5);
1832 [[maybe_unused]]
const auto __i6 = __to_intrin(__v6);
1833 [[maybe_unused]]
const auto __i7 = __to_intrin(__v7);
1834 using _Up =
typename _VectorTraits<_To>::value_type;
1835 constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
1837 static_assert(8 * _Np <= _M,
1838 "__v4-__v7 would be discarded; use the four/two/one-argument "
1839 "__convert_x86 overload instead");
1842 [[maybe_unused]]
constexpr bool __x_to_x
1843 =
sizeof(__v0) <= 16 &&
sizeof(_To) <= 16;
1844 [[maybe_unused]]
constexpr bool __x_to_y
1845 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 32;
1846 [[maybe_unused]]
constexpr bool __x_to_z
1847 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 64;
1848 [[maybe_unused]]
constexpr bool __y_to_x
1849 =
sizeof(__v0) == 32 &&
sizeof(_To) <= 16;
1850 [[maybe_unused]]
constexpr bool __y_to_y
1851 =
sizeof(__v0) == 32 &&
sizeof(_To) == 32;
1852 [[maybe_unused]]
constexpr bool __y_to_z
1853 =
sizeof(__v0) == 32 &&
sizeof(_To) == 64;
1854 [[maybe_unused]]
constexpr bool __z_to_x
1855 =
sizeof(__v0) == 64 &&
sizeof(_To) <= 16;
1856 [[maybe_unused]]
constexpr bool __z_to_y
1857 =
sizeof(__v0) == 64 &&
sizeof(_To) == 32;
1858 [[maybe_unused]]
constexpr bool __z_to_z
1859 =
sizeof(__v0) == 64 &&
sizeof(_To) == 64;
1862 [[maybe_unused]]
constexpr bool __i_to_i
1863 = is_integral_v<_Up> && is_integral_v<_Tp>;
1864 [[maybe_unused]]
constexpr bool __i64_to_i8
1865 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 1;
1866 [[maybe_unused]]
constexpr bool __f64_to_i8
1867 = is_integral_v<_Up> &&
sizeof(_Up) == 1
1868 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1870 if constexpr (__i_to_i)
1872 static_assert(__x_to_x || __have_avx2,
1873 "integral conversions with ymm registers require AVX2");
1874 static_assert(__have_avx512bw
1875 || ((
sizeof(_Tp) >= 4 ||
sizeof(__v0) < 64)
1876 && (
sizeof(_Up) >= 4 ||
sizeof(_To) < 64)),
1877 "8/16-bit integers in zmm registers require AVX512BW");
1878 static_assert((
sizeof(__v0) < 64 &&
sizeof(_To) < 64) || __have_avx512f,
1879 "integral conversions with ymm registers require AVX2");
1882 if constexpr (
sizeof(__v0) < 16 || (
sizeof(__v0) == 16 && __have_avx2)
1883 || (
sizeof(__v0) == 16 && __have_avx
1884 && is_floating_point_v<_Tp>)
1885 || (
sizeof(__v0) == 32 && __have_avx512f))
1889 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
1890 __concat(__v4, __v5), __concat(__v6, __v7));
1897 !(is_floating_point_v<
1898 _Tp> == is_floating_point_v<_Up> &&
sizeof(_Tp) ==
sizeof(_Up)));
1899 static_assert(!(8 * _Np < _M &&
sizeof(_To) > 16),
1900 "zero extension should be impossible");
1901 if constexpr (__i64_to_i8)
1903 if constexpr (__x_to_x && __have_ssse3)
1906 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1908 (((__v0 & 0xff) | ((__v1 & 0xff) << 8))
1909 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24)))
1910 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40))
1911 | (((__v6 & 0xff) << 48) | (__v7 << 56)))),
1912 _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14,
1915 else if constexpr (__x_to_x)
1917 const auto __a = _mm_unpacklo_epi8(__i0, __i1);
1918 const auto __b = _mm_unpackhi_epi8(__i0, __i1);
1919 const auto __c = _mm_unpacklo_epi8(__i2, __i3);
1920 const auto __d = _mm_unpackhi_epi8(__i2, __i3);
1921 const auto __e = _mm_unpacklo_epi8(__i4, __i5);
1922 const auto __f = _mm_unpackhi_epi8(__i4, __i5);
1923 const auto __g = _mm_unpacklo_epi8(__i6, __i7);
1924 const auto __h = _mm_unpackhi_epi8(__i6, __i7);
1925 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(
1926 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b),
1927 _mm_unpacklo_epi8(__c, __d)),
1928 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f),
1929 _mm_unpacklo_epi8(__g, __h))
1932 else if constexpr (__y_to_y)
1936 (((__v0 & 0xff) | ((__v1 & 0xff) << 8))
1937 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24)))
1938 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40))
1939 | (((__v6 & 0xff) << 48) | ((__v7 << 56)))));
1950 auto __b = _mm256_shuffle_epi8(
1952 __a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
1953 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11,
1954 4, 12, 5, 13, 6, 14, 7, 15));
1957 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
1958 __c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13,
1959 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11,
1960 4, 5, 12, 13, 6, 7, 14, 15)));
1962 else if constexpr (__z_to_z)
1965 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2,
1967 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
1971 else if constexpr (__f64_to_i8)
1973 return __convert_x86<_To>(
1974 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
1975 __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3),
1976 __convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5),
1977 __convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7));
1980 __assert_unreachable<_Tp>();
1984 if constexpr (
sizeof(_To) >= 32)
1987 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3),
1988 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
1990 else if constexpr (
sizeof(_To) == 16)
1993 = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3));
1995 = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7));
1996 static_assert(
sizeof(_Up) == 1 && _Np == 2);
1997 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
2001 __assert_unreachable<_Tp>();
2011template <
typename _To,
typename _V,
typename _Traits>
2012 _GLIBCXX_SIMD_INTRINSIC _To
2013 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
2014 _V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12,
2015 _V __v13, _V __v14, _V __v15)
2018 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
2019 __concat(__v4, __v5), __concat(__v6, __v7),
2020 __concat(__v8, __v9), __concat(__v10, __v11),
2021 __concat(__v12, __v13), __concat(__v14, __v15));
constexpr const _Tp & max(const _Tp &, const _Tp &)
Returns the greater of the two arguments; if they are equivalent, the first argument is returned.
constexpr const _Tp & min(const _Tp &, const _Tp &)
Returns the lesser of the two arguments; if they are equivalent, the first argument is returned.