// libstdc++
// simd_x86.h
1// Simd x86 specific implementations -*- C++ -*-
2
3// Copyright (C) 2020-2022 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
26#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
27
28#if __cplusplus >= 201703L
29
30#if !_GLIBCXX_SIMD_X86INTRIN
31#error \
32 "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
33#endif
34
35_GLIBCXX_SIMD_BEGIN_NAMESPACE
36
37// __to_masktype {{{
38// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
39// __vector_type_t.
// Reinterpret the wrapped vector as a vector of equally sized signed
// integers. The bit pattern is unchanged; only the element type changes,
// which is the representation used for vector masks.
template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
  {
    return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(
      __x._M_data);
  }
47
// Overload for plain vector builtins (SFINAE-constrained via _TVT to actual
// vector types): reinterpret to the same-width vector of equally sized
// integers.
template <typename _TV,
          typename _TVT
          = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
          typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
  __to_masktype(_TV __x)
  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
55
56// }}}
57// __interleave128_lo {{{
// Interleave the elements of __av and __bv. For 16-byte vectors this is a
// full low-half interleave ({a0, b0, a1, b1, ...}); for 32- and 64-byte
// vectors the index lists below interleave the low half of EACH 128-bit
// lane independently, matching the lane-local behavior of the x86
// punpckl*/unpacklo instructions.
template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
          typename _Trait = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
  {
    // Convert both arguments to the common vector type first.
    const _Tp __a(__av);
    const _Tp __b(__bv);
    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
      return _Tp{__a[0], __b[0]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[1], __b[1]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[2], __b[2], __a[3], __b[3]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
                 __a[6], __b[6], __a[7], __b[7]};
    // 32-byte vectors: two 128-bit lanes, interleave within each lane.
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[2], __b[2]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[4], __b[4], __a[5], __b[5]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
                 __a[10], __b[10], __a[11], __b[11]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23]};
    // 64-byte vectors: four 128-bit lanes, interleave within each lane.
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[2], __b[2],
                 __a[4], __b[4], __a[6], __b[6]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
                 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
                 __a[12], __b[12], __a[13], __b[13]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
                 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
                 __a[26], __b[26], __a[27], __b[27]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
                 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
                 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
                 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
                 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
                 __b[55]};
    else
      __assert_unreachable<_Tp>();
  }
118
119// }}}
120// __is_zero{{{
121template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
122 _GLIBCXX_SIMD_INTRINSIC constexpr bool
123 __is_zero(_Tp __a)
124 {
125 if (!__builtin_is_constant_evaluated())
126 {
127 if constexpr (__have_avx)
128 {
129 if constexpr (_TVT::template _S_is<float, 8>)
130 return _mm256_testz_ps(__a, __a);
131 else if constexpr (_TVT::template _S_is<double, 4>)
132 return _mm256_testz_pd(__a, __a);
133 else if constexpr (sizeof(_Tp) == 32)
134 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
135 else if constexpr (_TVT::template _S_is<float>)
136 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
137 else if constexpr (_TVT::template _S_is<double, 2>)
138 return _mm_testz_pd(__a, __a);
139 else
140 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
141 }
142 else if constexpr (__have_sse4_1)
143 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
144 __intrin_bitcast<__m128i>(__a));
145 }
146 else if constexpr (sizeof(_Tp) <= 8)
147 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
148 else
149 {
150 const auto __b = __vector_bitcast<_LLong>(__a);
151 if constexpr (sizeof(__b) == 16)
152 return (__b[0] | __b[1]) == 0;
153 else if constexpr (sizeof(__b) == 32)
154 return __is_zero(__lo128(__b) | __hi128(__b));
155 else if constexpr (sizeof(__b) == 64)
156 return __is_zero(__lo256(__b) | __hi256(__b));
157 else
158 __assert_unreachable<_Tp>();
159 }
160 }
161
162// }}}
163// __movemask{{{
// Collect the sign bit (MSB) of each element into the low bits of the
// returned int: per float for *_ps, per double for *_pd, and per byte for
// integer vectors (movemask_epi8).
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
  __movemask(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_movemask_ps(__to_intrin(__a));
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_movemask_pd(__to_intrin(__a));
        else
          return _mm256_movemask_epi8(__to_intrin(__a));
      }
    else if constexpr (_TVT::template _S_is<float>)
      return _mm_movemask_ps(__to_intrin(__a));
    else if constexpr (_TVT::template _S_is<double>)
      return _mm_movemask_pd(__to_intrin(__a));
    else
      return _mm_movemask_epi8(__to_intrin(__a));
  }
184
185// }}}
186// __testz{{{
// Returns non-zero iff (__a & __b) is all-zeros (PTEST/VTEST ZF semantics).
// Uses the test intrinsics where available, a __movemask emulation without
// SSE4.1, and __is_zero during constant evaluation.
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testz(_TI __a, _TI __b)
  {
    // Only valid for true intrinsic types (__m128, __m256i, ...).
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
            else
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
          }
        // 128-bit float/double variants of vtestz require AVX.
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                 __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          // Pre-SSE4.1: (__a & __b) == 0 elementwise yields all-ones, so a
          // non-zero movemask means every element of the AND was zero.
          return __movemask(0 == __and(__a, __b)) != 0;
      }
    else
      return __is_zero(__and(__a, __b));
  }
217
218// }}}
219// __testc{{{
220// requires SSE4.1 or above
// Returns non-zero iff (~__a & __b) is all-zeros, i.e. every bit set in __b
// is also set in __a (PTEST/VTEST CF semantics).
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testc(_TI __a, _TI __b)
  {
    // Only valid for true intrinsic types (__m128, __m256i, ...).
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (__builtin_is_constant_evaluated())
      return __is_zero(__andnot(__a, __b));

    if constexpr (sizeof(_TI) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_testc_ps(__a, __b);
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_testc_pd(__a, __b);
        else
          return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
      }
    else if constexpr (_TVT::template _S_is<float> && __have_avx)
      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
    else if constexpr (_TVT::template _S_is<double> && __have_avx)
      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
    else
      {
        // No pre-SSE4.1 fallback here: callers must ensure SSE4.1 (see the
        // "requires SSE4.1 or above" note on this section).
        static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
        return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                               __intrin_bitcast<__m128i>(__to_intrin(__b)));
      }
  }
250
251// }}}
252// __testnzc{{{
// Returns non-zero iff both (__a & __b) and (~__a & __b) have set bits,
// i.e. the PTEST/VTEST "ZF == 0 and CF == 0" condition.
template <typename _TI, typename _TVT = _VectorTraits<_TI>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
  __testnzc(_TI __a, _TI __b)
  {
    // Only valid for true intrinsic types (__m128, __m256i, ...).
    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
                                                    _TVT::_S_full_size>>);
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (sizeof(_TI) == 32)
          {
            if constexpr (_TVT::template _S_is<float>)
              return _mm256_testnzc_ps(__a, __b);
            else if constexpr (_TVT::template _S_is<double>)
              return _mm256_testnzc_pd(__a, __b);
            else
              return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
          }
        else if constexpr (_TVT::template _S_is<float> && __have_avx)
          return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (_TVT::template _S_is<double> && __have_avx)
          return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
        else if constexpr (__have_sse4_1)
          return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
                                   __intrin_bitcast<__m128i>(__to_intrin(__b)));
        else
          // Pre-SSE4.1 emulation via movemask of elementwise comparisons.
          return __movemask(0 == __and(__a, __b)) == 0
                 && __movemask(0 == __andnot(__a, __b)) == 0;
      }
    else
      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
  }
284
285// }}}
286// __xzyw{{{
287// shuffles the complete vector, swapping the inner two quarters. Often useful
288// for AVX for fixing up a shuffle result.
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _Tp
  __xzyw(_Tp __a)
  {
    // Work on quarter-vector-sized elements so each vector has exactly 4
    // logical quarters; swapping positions 1 and 2 swaps the inner two.
    if constexpr (sizeof(_Tp) == 16)
      {
        // 16-byte vector: 4-byte quarters (float or int).
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 32)
      {
        // 32-byte vector: 8-byte quarters (double or long long).
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(
          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
      }
    else if constexpr (sizeof(_Tp) == 64)
      {
        // 64-byte vector: quarters are pairs of 8-byte elements.
        const auto __x = __vector_bitcast<conditional_t<
          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
        return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
                                                   __x[5], __x[2], __x[3],
                                                   __x[6], __x[7]});
      }
    else
      __assert_unreachable<_Tp>();
  }
318
319// }}}
320// __maskload_epi32{{{
321template <typename _Tp>
322 _GLIBCXX_SIMD_INTRINSIC auto
323 __maskload_epi32(const int* __ptr, _Tp __k)
324 {
325 if constexpr (sizeof(__k) == 16)
326 return _mm_maskload_epi32(__ptr, __k);
327 else
328 return _mm256_maskload_epi32(__ptr, __k);
329 }
330
331// }}}
332// __maskload_epi64{{{
333template <typename _Tp>
334 _GLIBCXX_SIMD_INTRINSIC auto
335 __maskload_epi64(const _LLong* __ptr, _Tp __k)
336 {
337 if constexpr (sizeof(__k) == 16)
338 return _mm_maskload_epi64(__ptr, __k);
339 else
340 return _mm256_maskload_epi64(__ptr, __k);
341 }
342
343// }}}
344// __maskload_ps{{{
345template <typename _Tp>
346 _GLIBCXX_SIMD_INTRINSIC auto
347 __maskload_ps(const float* __ptr, _Tp __k)
348 {
349 if constexpr (sizeof(__k) == 16)
350 return _mm_maskload_ps(__ptr, __k);
351 else
352 return _mm256_maskload_ps(__ptr, __k);
353 }
354
355// }}}
356// __maskload_pd{{{
357template <typename _Tp>
358 _GLIBCXX_SIMD_INTRINSIC auto
359 __maskload_pd(const double* __ptr, _Tp __k)
360 {
361 if constexpr (sizeof(__k) == 16)
362 return _mm_maskload_pd(__ptr, __k);
363 else
364 return _mm256_maskload_pd(__ptr, __k);
365 }
366
367// }}}
368
369#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
370#include "simd_x86_conversions.h"
371#endif
372
373// ISA & type detection {{{
374template <typename _Tp, size_t _Np>
375 constexpr bool
376 __is_sse_ps()
377 {
378 return __have_sse
379 && is_same_v<_Tp,
380 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
381 }
382
383template <typename _Tp, size_t _Np>
384 constexpr bool
385 __is_sse_pd()
386 {
387 return __have_sse2
388 && is_same_v<_Tp,
389 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16;
390 }
391
392template <typename _Tp, size_t _Np>
393 constexpr bool
394 __is_avx_ps()
395 {
396 return __have_avx
397 && is_same_v<_Tp,
398 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
399 }
400
401template <typename _Tp, size_t _Np>
402 constexpr bool
403 __is_avx_pd()
404 {
405 return __have_avx
406 && is_same_v<_Tp,
407 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32;
408 }
409
410template <typename _Tp, size_t _Np>
411 constexpr bool
412 __is_avx512_ps()
413 {
414 return __have_avx512f
415 && is_same_v<_Tp,
416 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
417 }
418
419template <typename _Tp, size_t _Np>
420 constexpr bool
421 __is_avx512_pd()
422 {
423 return __have_avx512f
424 && is_same_v<_Tp,
425 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64;
426 }
427
428// }}}
429struct _MaskImplX86Mixin;
430
431// _CommonImplX86 {{{
// x86-specific common helpers (stores, bool-array stores, blends) shared by
// the simd and simd_mask implementations; falls back to _CommonImplBuiltin
// where no better x86 instruction sequence exists.
struct _CommonImplX86 : _CommonImplBuiltin
{
#ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
  // _S_converts_via_decomposition {{{
  // Whether the _From -> _To conversion (with a _ToSize-byte destination)
  // has no single-instruction implementation on this target and therefore
  // decomposes into scalar conversions.
  template <typename _From, typename _To, size_t _ToSize>
    static constexpr bool _S_converts_via_decomposition()
    {
      if constexpr (is_integral_v<
                      _From> && is_integral_v<_To> && sizeof(_From) == 8
                    && _ToSize == 16)
        return (sizeof(_To) == 2 && !__have_ssse3)
               || (sizeof(_To) == 1 && !__have_avx512f);
      else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>)
        return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8
                && !__have_avx512dq)
               || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1
                   && _ToSize == 16);
      else if constexpr (
        is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8
        && !__have_avx512dq)
        return (sizeof(_To) == 4 && _ToSize == 16)
               || (sizeof(_To) == 8 && _ToSize < 64);
      else
        return false;
    }

  template <typename _From, typename _To, size_t _ToSize>
    static inline constexpr bool __converts_via_decomposition_v
      = _S_converts_via_decomposition<_From, _To, _ToSize>();

  // }}}
#endif
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // Store _Np elements of __x to (unaligned) __addr. For non-power-of-two
  // byte counts, AVX512BW+VL masked stores write exactly _Bytes bytes in one
  // instruction; otherwise defer to the generic builtin implementation.
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __x,
                                                 void* __addr)
    {
      constexpr size_t _Bytes = _Np * sizeof(_Tp);

      // (_Bytes & (_Bytes - 1)) != 0 <=> _Bytes is not a power of two.
      if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl)
        {
          const auto __v = __to_intrin(__x);

          // Pick the widest element granularity that still allows an exact
          // byte count; each mask selects the low _Bytes/granularity lanes.
          if constexpr (_Bytes & 1)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes),
                                     __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes),
                                        __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi8(__addr,
                                        0xffffffffffffffffull >> (64 - _Bytes),
                                        __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 2)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi16(__addr,
                                         0xffffffffull >> (32 - _Bytes / 2),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else if constexpr (_Bytes & 4)
            {
              if constexpr (_Bytes < 16)
                _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4),
                                      __intrin_bitcast<__m128i>(__v));
              else if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4),
                                         __intrin_bitcast<__m512i>(__v));
            }
          else
            {
              static_assert(
                _Bytes > 16,
                "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes "
                "- 1)) != 0 is impossible");
              if constexpr (_Bytes < 32)
                _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8),
                                         __intrin_bitcast<__m256i>(__v));
              else
                _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8),
                                         __intrin_bitcast<__m512i>(__v));
            }
        }
      else
        _CommonImplBuiltin::_S_store(__x, __addr);
    }

  // }}}
  // _S_store_bool_array(_BitMask) {{{
  // Expand the _Np mask bits in __x to _Np bool objects (bytes of value 0
  // or 1) at __mem. Strategy, best first: AVX512BW+VL movm + AND with 1;
  // BMI2 PDEP to spread bits into bytes; SSE2 unpack + compare; otherwise
  // the generic builtin implementation.
  template <size_t _Np, bool _Sanitized>
    _GLIBCXX_SIMD_INTRINSIC static constexpr void
    _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
    {
      if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
        _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr {
                        if constexpr (_Np <= 16)
                          return _mm_movm_epi8(__x._M_to_bits());
                        else if constexpr (_Np <= 32)
                          return _mm256_movm_epi8(__x._M_to_bits());
                        else if constexpr (_Np <= 64)
                          return _mm512_movm_epi8(__x._M_to_bits());
                        else
                          __assert_unreachable<_SizeConstant<_Np>>();
                      }()),
                      __mem);
      else if constexpr (__have_bmi2)
        {
          if constexpr (_Np <= 4)
            // PDEP deposits one mask bit into the LSB of each byte.
            _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
          else
            // Process one machine word (sizeof(size_t) bits) per step.
            __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
              [&](auto __i) {
                constexpr size_t __offset = __i * sizeof(size_t);
                constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
                if constexpr (__todo == 1)
                  __mem[__offset] = __x[__offset];
                else
                  {
                    const auto __bools =
#ifdef __x86_64__
                      _pdep_u64(__x.template _M_extract<__offset>().to_ullong(),
                                0x0101010101010101ULL);
#else // __x86_64__
                      _pdep_u32(
                        __x.template _M_extract<__offset>()._M_to_bits(),
                        0x01010101U);
#endif // __x86_64__
                    _S_store<__todo>(__bools, __mem + __offset);
                  }
              });
        }
      else if constexpr (__have_sse2 && _Np > 7)
        // Expand up to 16 bits per step into a 16-byte vector of 0/1.
        __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) {
          constexpr int __offset = __i * 16;
          constexpr int __todo = std::min(16, int(_Np) - __offset);
          const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
          __vector_type16_t<_UChar> __bools;
          if constexpr (__have_avx512f)
            {
              // Widen bits to 32-bit 0/1 lanes, then narrow 32 -> 16 -> 8
              // bits per lane (__xzyw fixes the lane order after packs).
              auto __as32bits
                = _mm512_maskz_mov_epi32(__bits, __to_intrin(
                                                   __vector_broadcast<16>(1)));
              auto __as16bits
                = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
                                            __todo > 8 ? __hi256(__as32bits)
                                                       : __m256i()));
              __bools = __vector_bitcast<_UChar>(
                _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
            }
          else
            {
              // Broadcast each mask byte 8x, isolate one bit per byte, then
              // map non-zero -> 1 and zero -> 0.
              using _V = __vector_type_t<_UChar, 16>;
              auto __tmp = _mm_cvtsi32_si128(__bits);
              __tmp = _mm_unpacklo_epi8(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi16(__tmp, __tmp);
              __tmp = _mm_unpacklo_epi32(__tmp, __tmp);
              _V __tmp2 = reinterpret_cast<_V>(__tmp);
              __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128,
                           1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index
              __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01
            }
          _S_store<__todo>(__bools, __mem + __offset);
        });
      else
        _CommonImplBuiltin::_S_store_bool_array(__x, __mem);
    }

  // }}}
  // _S_blend_avx512 {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask
  // __k
  template <typename _Kp, typename _TV>
    _GLIBCXX_SIMD_INTRINSIC static _TV
    _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept
    {
#ifdef __clang__
      // FIXME: this does a boolean choice, not a blend
      return __k ? __a : __b;
#else
      static_assert(__is_vector_type_v<_TV>);
      using _Tp = typename _VectorTraits<_TV>::value_type;
      static_assert(sizeof(_TV) >= 16);
      static_assert(sizeof(_Tp) <= 8);
      // Integer type of matching element width for the integral blendm
      // builtins (b/w/d/q); float/double use the ps/pd builtins directly.
      using _IntT
        = conditional_t<(sizeof(_Tp) > 2),
                        conditional_t<sizeof(_Tp) == 4, int, long long>,
                        conditional_t<sizeof(_Tp) == 1, char, short>>;
      [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a);
      [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b);
      if constexpr (sizeof(_TV) == 64)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_512_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_512_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_512_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 32)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_256_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_256_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_256_mask(__aa, __bb, __k));
        }
      else if constexpr (sizeof(_TV) == 16)
        {
          if constexpr (sizeof(_Tp) == 1)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmb_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 2)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmw_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmps_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 4)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmd_128_mask(__aa, __bb, __k));
          else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>)
            return __builtin_ia32_blendmpd_128_mask(__a, __b, __k);
          else if constexpr (sizeof(_Tp) == 8)
            return reinterpret_cast<_TV>(
              __builtin_ia32_blendmq_128_mask(__aa, __bb, __k));
        }
#endif
    }

  // }}}
  // _S_blend_intrin {{{
  // Returns: __k ? __b : __a
  // TODO: reverse __a and __b to match COND_EXPR
  // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32
  // Bytes wide
  template <typename _Tp>
    _GLIBCXX_SIMD_INTRINSIC static _Tp _S_blend_intrin(_Tp __k, _Tp __a,
                                                       _Tp __b) noexcept
    {
      static_assert(is_same_v<decltype(__to_intrin(__a)), _Tp>);
      // Overload set dispatching to the blendv builtin matching _Tp; note
      // that the builtins take (a, b, mask) while _S_blend_intrin's
      // signature puts the mask first.
      constexpr struct
      {
        _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b,
                                                  __m128 __k) const noexcept
        {
          return __builtin_ia32_blendvps(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b,
                                                   __m128d __k) const noexcept
        {
          return __builtin_ia32_blendvpd(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b,
                                                   __m128i __k) const noexcept
        {
          return reinterpret_cast<__m128i>(
            __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a),
                                       reinterpret_cast<__v16qi>(__b),
                                       reinterpret_cast<__v16qi>(__k)));
        }
        _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b,
                                                  __m256 __k) const noexcept
        {
          return __builtin_ia32_blendvps256(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b,
                                                   __m256d __k) const noexcept
        {
          return __builtin_ia32_blendvpd256(__a, __b, __k);
        }
        _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b,
                                                   __m256i __k) const noexcept
        {
          if constexpr (__have_avx2)
            return reinterpret_cast<__m256i>(
              __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a),
                                         reinterpret_cast<__v32qi>(__b),
                                         reinterpret_cast<__v32qi>(__k)));
          else
            // AVX1 has no 256-bit pblendvb; blendvps works because the
            // callers' masks set all bits of each element.
            return reinterpret_cast<__m256i>(
              __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a),
                                         reinterpret_cast<__v8sf>(__b),
                                         reinterpret_cast<__v8sf>(__k)));
        }
      } __eval;
      return __eval(__a, __b, __k);
    }

  // }}}
  // _S_blend {{{
  // Returns: __k ? __at1 : __at0
  // TODO: reverse __at0 and __at1 to match COND_EXPR
  // Overload for AVX-512 bitmasks; requires AVX-512F.
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<bool, _Np> __k, _SimdWrapper<_Tp, _Np> __at0,
             _SimdWrapper<_Tp, _Np> __at1)
    {
      static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
      if (__k._M_is_constprop() && __at0._M_is_constprop()
          && __at1._M_is_constprop())
        // All inputs known at compile time: select element-wise so the
        // result constant-propagates.
        return __generate_from_n_evaluations<_Np,
                                             __vector_type_t<_Tp, _Np>>([&](
          auto __i) constexpr { return __k[__i] ? __at1[__i] : __at0[__i]; });
      else if constexpr (sizeof(__at0) == 64
                         || (__have_avx512vl && sizeof(__at0) >= 16))
        return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
      else
        {
          // No AVX-512VL (or sub-16-byte data): widen to the smallest size
          // the available blendm instruction accepts.
          static_assert((__have_avx512vl && sizeof(__at0) < 16)
                        || !__have_avx512vl);
          constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp);
          return __vector_bitcast<_Tp, _Np>(
            _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0),
                            __vector_bitcast<_Tp, __size>(__at1)));
        }
    }

  // Overload for vector masks (all bits of an element set or clear).
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
    _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
             _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
    {
      const auto __kk = __wrapper_bitcast<_Tp>(__k);
      if (__builtin_is_constant_evaluated()
          || (__kk._M_is_constprop() && __at0._M_is_constprop()
              && __at1._M_is_constprop()))
        {
          // Bitwise select; only returned if it actually constant-folded.
          auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1));
          if (__r._M_is_constprop())
            return __r;
        }
      if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl)
                    && (sizeof(_Tp) >= 4 || __have_avx512bw))
        // convert to bitmask and call overload above
        return _S_blend(
          _SimdWrapper<bool, _Np>(
            __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k)
              ._M_to_bits()),
          __at0, __at1);
      else
        {
          // Since GCC does not assume __k to be a mask, using the builtin
          // conditional operator introduces an extra compare against 0 before
          // blending. So we rather call the intrinsic here.
          if constexpr (__have_sse4_1)
            return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0),
                                   __to_intrin(__at1));
          else
            return __or(__andnot(__kk, __at0), __and(__kk, __at1));
        }
    }

  // }}}
};
822
823// }}}
824// _SimdImplX86 {{{
825template <typename _Abi, typename>
826 struct _SimdImplX86 : _SimdImplBuiltin<_Abi>
827 {
828 using _Base = _SimdImplBuiltin<_Abi>;
829
830 template <typename _Tp>
831 using _MaskMember = typename _Base::template _MaskMember<_Tp>;
832
833 template <typename _Tp>
834 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
835
836 template <typename _Tp>
837 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
838
839 template <typename _Tp>
840 static constexpr size_t _S_max_store_size
841 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64
842 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32
843 : 16;
844 using _MaskImpl = typename _Abi::_MaskImpl;
845
846 // _S_masked_load {{{
847 template <typename _Tp, size_t _Np, typename _Up>
848 static inline _SimdWrapper<_Tp, _Np>
849 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
850 const _Up* __mem) noexcept
851 {
852 static_assert(_Np == _S_size<_Tp>);
853 if constexpr (is_same_v<_Tp, _Up> || // no conversion
854 (sizeof(_Tp) == sizeof(_Up)
855 && is_integral_v<
856 _Tp> == is_integral_v<_Up>) // conversion via bit
857 // reinterpretation
858 )
859 {
860 [[maybe_unused]] const auto __intrin = __to_intrin(__merge);
861 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
862 && sizeof(_Tp) == 1)
863 {
864 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
865 if constexpr (sizeof(__intrin) == 16)
866 __merge = __vector_bitcast<_Tp, _Np>(
867 _mm_mask_loadu_epi8(__intrin, __kk, __mem));
868 else if constexpr (sizeof(__merge) == 32)
869 __merge = __vector_bitcast<_Tp, _Np>(
870 _mm256_mask_loadu_epi8(__intrin, __kk, __mem));
871 else if constexpr (sizeof(__merge) == 64)
872 __merge = __vector_bitcast<_Tp, _Np>(
873 _mm512_mask_loadu_epi8(__intrin, __kk, __mem));
874 else
875 __assert_unreachable<_Tp>();
876 }
877 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl)
878 && sizeof(_Tp) == 2)
879 {
880 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
881 if constexpr (sizeof(__intrin) == 16)
882 __merge = __vector_bitcast<_Tp, _Np>(
883 _mm_mask_loadu_epi16(__intrin, __kk, __mem));
884 else if constexpr (sizeof(__intrin) == 32)
885 __merge = __vector_bitcast<_Tp, _Np>(
886 _mm256_mask_loadu_epi16(__intrin, __kk, __mem));
887 else if constexpr (sizeof(__intrin) == 64)
888 __merge = __vector_bitcast<_Tp, _Np>(
889 _mm512_mask_loadu_epi16(__intrin, __kk, __mem));
890 else
891 __assert_unreachable<_Tp>();
892 }
893 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
894 && sizeof(_Tp) == 4 && is_integral_v<_Up>)
895 {
896 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
897 if constexpr (sizeof(__intrin) == 16)
898 __merge = __vector_bitcast<_Tp, _Np>(
899 _mm_mask_loadu_epi32(__intrin, __kk, __mem));
900 else if constexpr (sizeof(__intrin) == 32)
901 __merge = __vector_bitcast<_Tp, _Np>(
902 _mm256_mask_loadu_epi32(__intrin, __kk, __mem));
903 else if constexpr (sizeof(__intrin) == 64)
904 __merge = __vector_bitcast<_Tp, _Np>(
905 _mm512_mask_loadu_epi32(__intrin, __kk, __mem));
906 else
907 __assert_unreachable<_Tp>();
908 }
909 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
910 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>)
911 {
912 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
913 if constexpr (sizeof(__intrin) == 16)
914 __merge = __vector_bitcast<_Tp, _Np>(
915 _mm_mask_loadu_ps(__intrin, __kk, __mem));
916 else if constexpr (sizeof(__intrin) == 32)
917 __merge = __vector_bitcast<_Tp, _Np>(
918 _mm256_mask_loadu_ps(__intrin, __kk, __mem));
919 else if constexpr (sizeof(__intrin) == 64)
920 __merge = __vector_bitcast<_Tp, _Np>(
921 _mm512_mask_loadu_ps(__intrin, __kk, __mem));
922 else
923 __assert_unreachable<_Tp>();
924 }
925 else if constexpr (__have_avx2 && sizeof(_Tp) == 4
926 && is_integral_v<_Up>)
927 {
928 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
929 __merge
930 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
931 __vector_bitcast<_Tp, _Np>(
932 __maskload_epi32(reinterpret_cast<const int*>(__mem),
933 __to_intrin(__k))));
934 }
935 else if constexpr (__have_avx && sizeof(_Tp) == 4)
936 {
937 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
938 __merge
939 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
940 __vector_bitcast<_Tp, _Np>(
941 __maskload_ps(reinterpret_cast<const float*>(__mem),
942 __to_intrin(__k))));
943 }
944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
945 && sizeof(_Tp) == 8 && is_integral_v<_Up>)
946 {
947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
948 if constexpr (sizeof(__intrin) == 16)
949 __merge = __vector_bitcast<_Tp, _Np>(
950 _mm_mask_loadu_epi64(__intrin, __kk, __mem));
951 else if constexpr (sizeof(__intrin) == 32)
952 __merge = __vector_bitcast<_Tp, _Np>(
953 _mm256_mask_loadu_epi64(__intrin, __kk, __mem));
954 else if constexpr (sizeof(__intrin) == 64)
955 __merge = __vector_bitcast<_Tp, _Np>(
956 _mm512_mask_loadu_epi64(__intrin, __kk, __mem));
957 else
958 __assert_unreachable<_Tp>();
959 }
960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl)
961 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>)
962 {
963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
964 if constexpr (sizeof(__intrin) == 16)
965 __merge = __vector_bitcast<_Tp, _Np>(
966 _mm_mask_loadu_pd(__intrin, __kk, __mem));
967 else if constexpr (sizeof(__intrin) == 32)
968 __merge = __vector_bitcast<_Tp, _Np>(
969 _mm256_mask_loadu_pd(__intrin, __kk, __mem));
970 else if constexpr (sizeof(__intrin) == 64)
971 __merge = __vector_bitcast<_Tp, _Np>(
972 _mm512_mask_loadu_pd(__intrin, __kk, __mem));
973 else
974 __assert_unreachable<_Tp>();
975 }
976 else if constexpr (__have_avx2 && sizeof(_Tp) == 8
977 && is_integral_v<_Up>)
978 {
979 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
980 __merge
981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
982 __vector_bitcast<_Tp, _Np>(__maskload_epi64(
983 reinterpret_cast<const _LLong*>(__mem),
984 __to_intrin(__k))));
985 }
986 else if constexpr (__have_avx && sizeof(_Tp) == 8)
987 {
988 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32);
989 __merge
990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data),
991 __vector_bitcast<_Tp, _Np>(
992 __maskload_pd(reinterpret_cast<const double*>(__mem),
993 __to_intrin(__k))));
994 }
995 else
996 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
997 [&](auto __i) {
998 __merge._M_set(__i, static_cast<_Tp>(
999 __mem[__i]));
1000 });
1001 }
      /* Very uncertain whether the following improves anything.  Needs
       * benchmarking before it's activated.
1005 else if constexpr (sizeof(_Up) <= 8 && // no long double
1006 !__converts_via_decomposition_v<
1007 _Up, _Tp,
1008 sizeof(__merge)> // conversion via decomposition
1009 // is better handled via the
1010 // bit_iteration fallback below
1011 )
1012 {
1013 // TODO: copy pattern from _S_masked_store, which doesn't resort to
1014 // fixed_size
1015 using _Ap = simd_abi::deduce_t<_Up, _Np>;
1016 using _ATraits = _SimdTraits<_Up, _Ap>;
1017 using _AImpl = typename _ATraits::_SimdImpl;
1018 typename _ATraits::_SimdMember __uncvted{};
1019 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template
1020 _S_convert<_Up>(__k);
1021 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem);
1022 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter;
1023 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted));
1024 }
1025 */
1026 else
1027 __merge = _Base::_S_masked_load(__merge, __k, __mem);
1028 return __merge;
1029 }
1030
1031 // }}}
1032 // _S_masked_store_nocvt {{{
    // Masked store without element-type conversion, taking an AVX512 bitmask
    // __k: for every set bit i, __v[i] is written to __mem[i]; unselected
    // memory is left untouched.
    // Dispatch is on register width (sizeof(__vi): 64/32/16 bytes) and
    // element size: 8/16-bit elements need AVX512BW (plus VL below 512 bits),
    // 32/64-bit elements need AVX512F (plus VL).  With AVX512F but no VL, the
    // vector and bitmask are zero-extended to 512 bits and stored with a
    // single 512-bit masked store (the extended mask bits are zero, so no
    // memory past the original _Np elements is written).  Otherwise the
    // bitmask is expanded to a vector mask and the overload below is used.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
			    _SimdWrapper<bool, _Np> __k)
      {
	[[maybe_unused]] const auto __vi = __to_intrin(__v);
	if constexpr (sizeof(__vi) == 64)
	  {
	    static_assert(sizeof(__v) == 64 && __have_avx512f);
	    if constexpr (__have_avx512bw && sizeof(_Tp) == 1)
	      _mm512_mask_storeu_epi8(__mem, __k, __vi);
	    else if constexpr (__have_avx512bw && sizeof(_Tp) == 2)
	      _mm512_mask_storeu_epi16(__mem, __k, __vi);
	    else if constexpr (__have_avx512f && sizeof(_Tp) == 4)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm512_mask_storeu_epi32(__mem, __k, __vi);
		else
		  _mm512_mask_storeu_ps(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512f && sizeof(_Tp) == 8)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm512_mask_storeu_epi64(__mem, __k, __vi);
		else
		  _mm512_mask_storeu_pd(__mem, __k, __vi);
	      }
#if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32
      // with Skylake-AVX512, __have_avx512bw is true
	    else if constexpr (__have_sse2)
	      {
		using _M = __vector_type_t<_Tp, _Np>;
		using _MVT = _VectorTraits<_M>;
		_mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)),
				    __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)),
				    reinterpret_cast<char*>(__mem));
		_mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)),
				    __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
				      __k._M_data >> 1 * _MVT::_S_full_size)),
				    reinterpret_cast<char*>(__mem) + 1 * 16);
		_mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)),
				    __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
				      __k._M_data >> 2 * _MVT::_S_full_size)),
				    reinterpret_cast<char*>(__mem) + 2 * 16);
		if constexpr (_Np > 48 / sizeof(_Tp))
		  _mm_maskmoveu_si128(
		    __auto_bitcast(__extract<3, 4>(__v._M_data)),
		    __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(
		      __k._M_data >> 3 * _MVT::_S_full_size)),
		    reinterpret_cast<char*>(__mem) + 3 * 16);
	      }
#endif
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (sizeof(__vi) == 32)
	  {
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm256_mask_storeu_epi8(__mem, __k, __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm256_mask_storeu_epi16(__mem, __k, __vi);
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm256_mask_storeu_epi32(__mem, __k, __vi);
		else
		  _mm256_mask_storeu_ps(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm256_mask_storeu_epi64(__mem, __k, __vi);
		else
		  _mm256_mask_storeu_pd(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512f
				 && (sizeof(_Tp) >= 4 || __have_avx512bw))
	      {
		// use a 512-bit maskstore, using zero-extension of the bitmask
		_S_masked_store_nocvt(
		  _SimdWrapper64<_Tp>(
		    __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)),
		  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
	      }
	    else
	      // no usable AVX512 mask store: expand the bitmask into a vector
	      // mask and use the vector-mask overload below
	      _S_masked_store_nocvt(__v, __mem,
				    _MaskImpl::template _S_to_maskvector<
				      __int_for_sizeof_t<_Tp>, _Np>(__k));
	  }
	else if constexpr (sizeof(__vi) == 16)
	  {
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm_mask_storeu_epi8(__mem, __k, __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm_mask_storeu_epi16(__mem, __k, __vi);
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 4)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm_mask_storeu_epi32(__mem, __k, __vi);
		else
		  _mm_mask_storeu_ps(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512vl && sizeof(_Tp) == 8)
	      {
		if constexpr (is_integral_v<_Tp>)
		  _mm_mask_storeu_epi64(__mem, __k, __vi);
		else
		  _mm_mask_storeu_pd(__mem, __k, __vi);
	      }
	    else if constexpr (__have_avx512f
				 && (sizeof(_Tp) >= 4 || __have_avx512bw))
	      {
		// use a 512-bit maskstore, using zero-extension of the bitmask
		_S_masked_store_nocvt(
		  _SimdWrapper64<_Tp>(
		    __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)),
		  __mem, _SimdWrapper<bool, 64 / sizeof(_Tp)>(__k._M_data));
	      }
	    else
	      // no usable AVX512 mask store: expand the bitmask into a vector
	      // mask and use the vector-mask overload below
	      _S_masked_store_nocvt(__v, __mem,
				    _MaskImpl::template _S_to_maskvector<
				      __int_for_sizeof_t<_Tp>, _Np>(__k));
	  }
	else
	  __assert_unreachable<_Tp>();
      }
1159
    // Overload for a vector mask __k (per-element all-ones/all-zeros of the
    // matching integer type).  For 32/64-bit elements uses AVX/AVX2
    // maskstore instructions; for 8/16-bit elements converts the vector mask
    // to a bitmask where AVX512BW+VL is available, otherwise falls back to
    // SSE2 maskmovdqu (note: that instruction carries a non-temporal hint —
    // see the ISA reference).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
			    _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k)
      {
	if constexpr (sizeof(__v) <= 16)
	  {
	    // partial vectors are zero-padded up to a full __m128i
	    [[maybe_unused]] const auto __vi
	      = __intrin_bitcast<__m128i>(__as_vector(__v));
	    [[maybe_unused]] const auto __ki
	      = __intrin_bitcast<__m128i>(__as_vector(__k));
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi);
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
				 && is_integral_v<_Tp>)
	      _mm_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
	    else if constexpr (__have_avx && sizeof(_Tp) == 4)
	      _mm_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
			       __vector_bitcast<float>(__vi));
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
				 && is_integral_v<_Tp>)
	      _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi);
	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
	      _mm_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
			       __vector_bitcast<double>(__vi));
	    else if constexpr (__have_sse2)
	      _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast<char*>(__mem));
	  }
	else if constexpr (sizeof(__v) == 32)
	  {
	    [[maybe_unused]] const auto __vi
	      = __intrin_bitcast<__m256i>(__as_vector(__v));
	    [[maybe_unused]] const auto __ki
	      = __intrin_bitcast<__m256i>(__as_vector(__k));
	    if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1)
	      _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi);
	    else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2)
	      _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi);
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 4
				 && is_integral_v<_Tp>)
	      _mm256_maskstore_epi32(reinterpret_cast<int*>(__mem), __ki, __vi);
	    else if constexpr (sizeof(_Tp) == 4)
	      // a 32-byte vector implies AVX, so vmaskmovps is available
	      _mm256_maskstore_ps(reinterpret_cast<float*>(__mem), __ki,
				  __vector_bitcast<float>(__v));
	    else if constexpr (__have_avx2 && sizeof(_Tp) == 8
				 && is_integral_v<_Tp>)
	      _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki,
				     __vi);
	    else if constexpr (__have_avx && sizeof(_Tp) == 8)
	      _mm256_maskstore_pd(reinterpret_cast<double*>(__mem), __ki,
				  __vector_bitcast<double>(__v));
	    else if constexpr (__have_sse2)
	      {
		// split into two 16-byte maskmovdqu stores
		_mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki),
				    reinterpret_cast<char*>(__mem));
		_mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki),
				    reinterpret_cast<char*>(__mem) + 16);
	      }
	  }
	else
	  __assert_unreachable<_Tp>();
      }
1224
1225 // }}}
1226 // _S_masked_store {{{
    // Masked store with element-type conversion from _Tp (register type) to
    // _Up (memory type).  When both types are integral, _Tp is wider than
    // _Up, and a suitable AVX512 extension is present, a single truncating
    // masked-store intrinsic (vpmov*) combines the down-conversion with the
    // masked store; the intrinsic is selected by (sizeof(_Tp), sizeof(_Up),
    // register width).  All other combinations defer to the generic
    // implementation in _Base.
    template <typename _Tp, size_t _Np, typename _Up>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem,
		      const _MaskMember<_Tp> __k) noexcept
      {
	if constexpr (is_integral_v<
			_Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up)
		      && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw)
		      && (sizeof(__v) == 64 || __have_avx512vl))
	  { // truncating store
	    const auto __vi = __to_intrin(__v);
	    const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits();
	    if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
			  && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
			       && sizeof(__vi) == 64)
	      _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
			       && sizeof(__vi) == 32)
	      _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
	    else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
			       && sizeof(__vi) == 16)
	      _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  _Base::_S_masked_store(__v, __mem, __k);
      }
1299
1300 // }}}
1301 // _S_multiplies {{{
    // Element-wise multiplication.  x86 has no 8-bit SIMD multiply, so for
    // 1-byte elements the product is computed via 16-bit multiplies: for tiny
    // vectors (2-4 bytes) using scalar integer arithmetic on the
    // reinterpreted bits, otherwise by multiplying even and odd byte lanes
    // separately as shorts and blending the results.  For all other element
    // sizes, and in constant evaluation, GCC's built-in vector operator* is
    // used (via _Base or directly).
    template <typename _V, typename _VVT = _VectorTraits<_V>>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _V _S_multiplies(_V __x, _V __y)
      {
	using _Tp = typename _VVT::value_type;
	if (__builtin_is_constant_evaluated() || __x._M_is_constprop()
	    || __y._M_is_constprop())
	  return __as_vector(__x) * __as_vector(__y);
	else if constexpr (sizeof(_Tp) == 1)
	  {
	    if constexpr (sizeof(_V) == 2)
	      {
		// two bytes in one short: low byte is the masked product, the
		// high byte results from shifting x and masking y so the
		// product's low bits land in the high byte
		const auto __xs = reinterpret_cast<short>(__x._M_data);
		const auto __ys = reinterpret_cast<short>(__y._M_data);
		return reinterpret_cast<__vector_type_t<_Tp, 2>>(short(
		  ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00))));
	      }
	    else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3)
	      {
		// three bytes in one int, same per-byte trick as above
		const auto __xi = reinterpret_cast<int>(__x._M_data);
		const auto __yi = reinterpret_cast<int>(__y._M_data);
		return reinterpret_cast<__vector_type_t<_Tp, 3>>(
		  ((__xi * __yi) & 0xff)
		  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
		  | ((__xi >> 16) * (__yi & 0xff0000)));
	      }
	    else if constexpr (sizeof(_V) == 4)
	      {
		// four bytes in one int
		const auto __xi = reinterpret_cast<int>(__x._M_data);
		const auto __yi = reinterpret_cast<int>(__y._M_data);
		return reinterpret_cast<__vector_type_t<_Tp, 4>>(
		  ((__xi * __yi) & 0xff)
		  | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00)
		  | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000)
		  | ((__xi >> 24) * (__yi & 0xff000000u)));
	      }
	    else if constexpr (sizeof(_V) == 8 && __have_avx2
			       && is_signed_v<_Tp>)
	      // widen 8 bytes to 8 shorts, multiply, narrow on conversion
	      return __convert<typename _VVT::type>(
		__vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__x)))
		* __vector_bitcast<short>(_mm_cvtepi8_epi16(__to_intrin(__y))));
	    else if constexpr (sizeof(_V) == 8 && __have_avx2
			       && is_unsigned_v<_Tp>)
	      return __convert<typename _VVT::type>(
		__vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__x)))
		* __vector_bitcast<short>(_mm_cvtepu8_epi16(__to_intrin(__y))));
	    else
	      {
		// codegen of `x*y` is suboptimal (as of GCC 9.0.1)
		constexpr size_t __full_size = _VVT::_S_full_size;
		constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8;
		using _ShortW = _SimdWrapper<short, _Np>;
		// products of the even byte lanes (valid in the low byte of
		// each short)
		const _ShortW __even = __vector_bitcast<short, _Np>(__x)
				       * __vector_bitcast<short, _Np>(__y);
		// 0xff00 in every short element
		_ShortW __high_byte = _ShortW()._M_data - 256;
		//[&]() { asm("" : "+x"(__high_byte._M_data)); }();
		// products of the odd byte lanes (valid in the high byte of
		// each short)
		const _ShortW __odd
		  = (__vector_bitcast<short, _Np>(__x) >> 8)
		    * (__vector_bitcast<short, _Np>(__y) & __high_byte._M_data);
		if constexpr (__have_avx512bw && sizeof(_V) > 2)
		  return _CommonImplX86::_S_blend_avx512(
		    0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even),
		    __vector_bitcast<_Tp>(__odd));
		else if constexpr (__have_sse4_1 && sizeof(_V) > 2)
		  return _CommonImplX86::_S_blend_intrin(__to_intrin(
							   __high_byte),
							 __to_intrin(__even),
							 __to_intrin(__odd));
		else
		  // select low bytes from __even and high bytes from __odd
		  return __to_intrin(
		    __or(__andnot(__high_byte, __even), __odd));
	      }
	  }
	else
	  return _Base::_S_multiplies(__x, __y);
      }
1377
1378 // }}}
1379 // _S_divides {{{
1380#ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993
    // Element-wise division.  For integral elements of at most 4 bytes (and
    // a non-constant divisor at runtime) the division is carried out in
    // floating point: each _Tp value is exactly representable in the chosen
    // _Float (double for 32-bit, float for 8/16-bit elements), so the
    // quotient is exact after truncation.  Padding elements of __y are
    // forced non-zero to avoid spurious div-by-zero.  Everything else (and
    // constant folding) goes through _Base::_S_divides.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
	if (!__builtin_is_constant_evaluated()
	    && !__builtin_constant_p(__y._M_data))
	  if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4)
	    { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1)
	      // Note that using floating-point division is likely to raise the
	      // *Inexact* exception flag and thus appears like an invalid
	      // "as-if" transformation. However, C++ doesn't specify how the
	      // fpenv can be observed and points to C. C says that function
	      // calls are assumed to potentially raise fp exceptions, unless
	      // documented otherwise. Consequently, operator/, which is a
	      // function call, may raise fp exceptions.
	      /*const struct _CsrGuard
	      {
		const unsigned _M_data = _mm_getcsr();
		_CsrGuard()
		{
		  // turn off FP exceptions and flush-to-zero
		  _mm_setcsr(0x9f80);
		}
		~_CsrGuard() { _mm_setcsr(_M_data); }
	      } __csr;*/
	      using _Float = conditional_t<sizeof(_Tp) == 4, double, float>;
	      // as many _Float elements as fit into the widest available
	      // register, but no more than _Np
	      constexpr size_t __n_intermediate
		= std::min(_Np, (__have_avx512f ? 64
				 : __have_avx   ? 32
						: 16)
				  / sizeof(_Float));
	      using _FloatV = __vector_type_t<_Float, __n_intermediate>;
	      constexpr size_t __n_floatv
		= __div_roundup(_Np, __n_intermediate);
	      using _R = __vector_type_t<_Tp, _Np>;
	      const auto __xf = __convert_all<_FloatV, __n_floatv>(__x);
	      const auto __yf = __convert_all<_FloatV, __n_floatv>(
		_Abi::__make_padding_nonzero(__as_vector(__y)));
	      return __call_with_n_evaluations<__n_floatv>(
		[](auto... __quotients) {
		  return __vector_convert<_R>(__quotients...);
		},
		[&__xf,
		 &__yf](auto __i) -> _SimdWrapper<_Float, __n_intermediate> {
#if !defined __clang__ && __GCC_IEC_559 == 0
		  // If -freciprocal-math is active, using the `/` operator is
		  // incorrect because it may be translated to an imprecise
		  // multiplication with reciprocal. We need to use inline
		  // assembly to force a real division.
		  _FloatV __r;
		  if constexpr (__have_avx) // -mno-sse2avx is irrelevant
					    // because once -mavx is given, GCC
					    // emits VEX encoded vdivp[sd]
		    {
		      // sizeof(_Tp) == 4 implies _Float == double, hence vdivpd
		      if constexpr (sizeof(_Tp) == 4)
			asm("vdivpd\t{%2, %1, %0|%0, %1, %2}"
			    : "=x"(__r)
			    : "x"(__xf[__i]), "x"(__yf[__i]));
		      else
			asm("vdivps\t{%2, %1, %0|%0, %1, %2}"
			    : "=x"(__r)
			    : "x"(__xf[__i]), "x"(__yf[__i]));
		    }
		  else
		    {
		      __r = __xf[__i];
		      if constexpr (sizeof(_Tp) == 4)
			asm("divpd\t{%1, %0|%0, %1}"
			    : "=x"(__r)
			    : "x"(__yf[__i]));
		      else
			asm("divps\t{%1, %0|%0, %1}"
			    : "=x"(__r)
			    : "x"(__yf[__i]));
		    }
		  return __r;
#else
		  return __xf[__i] / __yf[__i];
#endif
		});
	    }
	/* 64-bit int division is potentially optimizable via double division if
	 * the value in __x is small enough and the conversion between
	 * int<->double is efficient enough:
	else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
			   sizeof(_Tp) == 8)
	  {
	    if constexpr (__have_sse4_1 && sizeof(__x) == 16)
	      {
		if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull,
						    0xffe0'0000'0000'0000ull}))
		  {
		    __x._M_data | 0x __vector_convert<__m128d>(__x._M_data)
		  }
	      }
	  }
	*/
	return _Base::_S_divides(__x, __y);
      }
1480 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993
1481
1482 // }}}
1483 // _S_modulus {{{
1484 template <typename _Tp, size_t _Np>
1485 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1486 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1487 {
1488 if (__builtin_is_constant_evaluated()
1489 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8)
1490 return _Base::_S_modulus(__x, __y);
1491 else
1492 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
1493 }
1494
1495 // }}}
1496 // _S_bit_shift_left {{{
1497 // Notes on UB. C++2a [expr.shift] says:
1498 // -1- [...] The operands shall be of integral or unscoped enumeration type
1499 // and integral promotions are performed. The type of the result is that
1500 // of the promoted left operand. The behavior is undefined if the right
1501 // operand is negative, or greater than or equal to the width of the
1502 // promoted left operand.
1503 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo
1504 // 2^N, where N is the width of the type of the result.
1505 //
1506 // C++17 [expr.shift] says:
1507 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated
1508 // bits are zero-filled. If E1 has an unsigned type, the value of the
1509 // result is E1 × 2^E2 , reduced modulo one more than the maximum value
1510 // representable in the result type. Otherwise, if E1 has a signed type
1511 // and non-negative value, and E1 × 2^E2 is representable in the
1512 // corresponding unsigned type of the result type, then that value,
1513 // converted to the result type, is the resulting value; otherwise, the
1514 // behavior is undefined.
1515 //
1516 // Consequences:
1517 // With C++2a signed and unsigned types have the same UB
1518 // characteristics:
1519 // - left shift is not UB for 0 <= RHS < max(32, #bits(T))
1520 //
1521 // With C++17 there's little room for optimizations because the standard
1522 // requires all shifts to happen on promoted integrals (i.e. int). Thus,
1523 // short and char shifts must assume shifts affect bits of neighboring
1524 // values.
1525 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
    // Left shift by a scalar count.  Only 1-byte elements need special
    // handling here (x86 has no 8-bit shift instruction); wider elements
    // fall through to the builtin vector shift at the end.  For a
    // compile-time-constant count the shift is built from additions or a
    // wider shift plus a mask; otherwise a 16-bit (v)psllw/vpsllvw is used
    // and the bits shifted in from neighboring bytes are masked off.
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      inline _GLIBCXX_CONST static typename _TVT::type
      _S_bit_shift_left(_Tp __xx, int __y)
      {
	using _V = typename _TVT::type;
	using _Up = typename _TVT::value_type;
	_V __x = __xx;
	[[maybe_unused]] const auto __ix = __to_intrin(__x);
	if (__builtin_is_constant_evaluated())
	  return __x << __y;
#if __cplusplus > 201703
	// after C++17, signed shifts have no UB, and behave just like unsigned
	// shifts
	else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>)
	  return __vector_bitcast<_Up>(
	    _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
			      __y));
#endif
	else if constexpr (sizeof(_Up) == 1)
	  {
	    // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894)
	    if (__builtin_constant_p(__y))
	      {
		if (__y == 0)
		  return __x;
		else if (__y == 1)
		  return __x + __x;
		else if (__y == 2)
		  {
		    // x << 2 as two additions
		    __x = __x + __x;
		    return __x + __x;
		  }
		else if (__y > 2 && __y < 8)
		  {
		    // shift as unsigned words, then clear the bits that
		    // crossed into the next byte
		    if constexpr (sizeof(__x) > sizeof(unsigned))
		      {
			const _UChar __mask = 0xff << __y; // precomputed vector
			return __vector_bitcast<_Up>(
			  __vector_bitcast<_UChar>(
			    __vector_bitcast<unsigned>(__x) << __y)
			  & __mask);
		      }
		    else
		      {
			// vector fits into a scalar register: shift the
			// reinterpreted integer and mask per byte
			const unsigned __mask
			  = (0xff & (0xff << __y)) * 0x01010101u;
			return reinterpret_cast<_V>(
			  static_cast<__int_for_sizeof_t<_V>>(
			    unsigned(
			      reinterpret_cast<__int_for_sizeof_t<_V>>(__x)
			      << __y)
			    & __mask));
		      }
		  }
		else if (__y >= 8 && __y < 32)
		  // every bit of an 8-bit element is shifted out
		  return _V();
		else
		  __builtin_unreachable();
	      }
	    // general strategy in the following: use an sllv instead of sll
	    // instruction, because it's 2 to 4 times faster:
	    else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16)
	      return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8(
		_mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix),
				  _mm256_set1_epi16(__y))));
	    else if constexpr (__have_avx512bw && sizeof(__x) == 32)
	      return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
		_mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix),
				  _mm512_set1_epi16(__y))));
	    else if constexpr (__have_avx512bw && sizeof(__x) == 64)
	      {
		// 64 bytes don't fit after widening to 16 bit; process the
		// two 256-bit halves separately and reconcatenate
		const auto __shift = _mm512_set1_epi16(__y);
		return __vector_bitcast<_Up>(
		  __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
			     _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)),
			   _mm512_cvtepi16_epi8(_mm512_sllv_epi16(
			     _mm512_cvtepi8_epi16(__hi256(__ix)), __shift))));
	      }
	    else if constexpr (__have_avx2 && sizeof(__x) == 32)
	      {
#if 1
		// build the per-byte mask 0xff << __y in __k, then shift and
		// mask off the bits that crossed byte boundaries
		const auto __shift = _mm_cvtsi32_si128(__y);
		auto __k
		  = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift);
		__k |= _mm256_srli_epi16(__k, 8);
		return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift)
					     & __k);
#else
		const _Up __k = 0xff << __y;
		return __vector_bitcast<_Up>(__vector_bitcast<int>(__x) << __y)
		       & __k;
#endif
	      }
	    else
	      {
		// SSE2 fallback: same shift-and-mask trick on 128 bits
		const auto __shift = _mm_cvtsi32_si128(__y);
		auto __k
		  = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift);
		__k |= _mm_srli_epi16(__k, 8);
		return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k);
	      }
	  }
	return __x << __y;
      }
1630
1631 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1632 inline _GLIBCXX_CONST static typename _TVT::type
1633 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y)
1634 {
1635 using _V = typename _TVT::type;
1636 using _Up = typename _TVT::value_type;
1637 _V __x = __xx;
1638 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1639 [[maybe_unused]] const auto __iy = __to_intrin(__y);
1640 if (__builtin_is_constant_evaluated())
1641 return __x << __y;
1642#if __cplusplus > 201703
1643 // after C++17, signed shifts have no UB, and behave just like unsigned
1644 // shifts
1645 else if constexpr (is_signed_v<_Up>)
1646 return __vector_bitcast<_Up>(
1647 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x),
1648 __vector_bitcast<make_unsigned_t<_Up>>(__y)));
1649#endif
1650 else if constexpr (sizeof(_Up) == 1)
1651 {
1652 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1653 return __vector_bitcast<_Up>(__concat(
1654 _mm512_cvtepi16_epi8(
1655 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)),
1656 _mm512_cvtepu8_epi16(__lo256(__iy)))),
1657 _mm512_cvtepi16_epi8(
1658 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)),
1659 _mm512_cvtepu8_epi16(__hi256(__iy))))));
1660 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1661 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1662 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix),
1663 _mm512_cvtepu8_epi16(__iy))));
1664 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl)
1665 return __intrin_bitcast<_V>(
1666 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix),
1667 _mm_cvtepu8_epi16(__iy))));
1668 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1669 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1670 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix),
1671 _mm256_cvtepu8_epi16(__iy))));
1672 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1673 return __intrin_bitcast<_V>(
1674 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16(
1675 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)),
1676 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy))))));
1677 else if constexpr (__have_sse4_1 && sizeof(__x) == 16)
1678 {
1679 auto __mask
1680 = __vector_bitcast<_Up>(__vector_bitcast<short>(__y) << 5);
1681 auto __x4
1682 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1683 __x4 &= char(0xf0);
1684 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1685 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4)));
1686 __mask += __mask;
1687 auto __x2
1688 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1689 __x2 &= char(0xfc);
1690 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1691 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2)));
1692 __mask += __mask;
1693 auto __x1 = __x + __x;
1694 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin(
1695 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1)));
1696 return __x
1697 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1698 }
1699 else if constexpr (sizeof(__x) == 16)
1700 {
1701 auto __mask
1702 = __vector_bitcast<_UChar>(__vector_bitcast<short>(__y) << 5);
1703 auto __x4
1704 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 4);
1705 __x4 &= char(0xf0);
1706 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x;
1707 __mask += __mask;
1708 auto __x2
1709 = __vector_bitcast<_Up>(__vector_bitcast<short>(__x) << 2);
1710 __x2 &= char(0xfc);
1711 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x;
1712 __mask += __mask;
1713 auto __x1 = __x + __x;
1714 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x;
1715 return __x
1716 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
1717 }
1718 else
1719 return __x << __y;
1720 }
1721 else if constexpr (sizeof(_Up) == 2)
1722 {
1723 if constexpr (sizeof __ix == 64 && __have_avx512bw)
1724 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy));
1725 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl)
1726 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy));
1727 else if constexpr (sizeof __ix == 32 && __have_avx512bw)
1728 return __vector_bitcast<_Up>(
1729 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix),
1730 _mm512_castsi256_si512(__iy))));
1731 else if constexpr (sizeof __ix == 32 && __have_avx2)
1732 {
1733 const auto __ux = __vector_bitcast<unsigned>(__x);
1734 const auto __uy = __vector_bitcast<unsigned>(__y);
1735 return __vector_bitcast<_Up>(_mm256_blend_epi16(
1736 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1737 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1738 }
1739 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl)
1740 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy));
1741 else if constexpr (sizeof __ix == 16 && __have_avx512bw)
1742 return __intrin_bitcast<_V>(
1743 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix),
1744 _mm512_castsi128_si512(__iy))));
1745 else if constexpr (sizeof __ix == 16 && __have_avx2)
1746 {
1747 const auto __ux = __vector_bitcast<unsigned>(__ix);
1748 const auto __uy = __vector_bitcast<unsigned>(__iy);
1749 return __intrin_bitcast<_V>(_mm_blend_epi16(
1750 __auto_bitcast(__ux << (__uy & 0x0000ffffu)),
1751 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa));
1752 }
1753 else if constexpr (sizeof __ix == 16)
1754 {
1755 using _Float4 = __vector_type_t<float, 4>;
1756 using _Int4 = __vector_type_t<int, 4>;
1757 using _UInt4 = __vector_type_t<unsigned, 4>;
1758 const _UInt4 __yu
1759 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3)));
1760 return __x
1761 * __intrin_bitcast<_V>(
1762 __vector_convert<_Int4>(_SimdWrapper<float, 4>(
1763 reinterpret_cast<_Float4>(__yu << 23)))
1764 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>(
1765 reinterpret_cast<_Float4>((__yu >> 16) << 23)))
1766 << 16));
1767 }
1768 else
1769 __assert_unreachable<_Tp>();
1770 }
1771 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16
1772 && !__have_avx2)
1773 // latency is suboptimal, but throughput is at full speedup
1774 return __intrin_bitcast<_V>(
1775 __vector_bitcast<unsigned>(__ix)
1776 * __vector_convert<__vector_type16_t<int>>(
1777 _SimdWrapper<float, 4>(__vector_bitcast<float>(
1778 (__vector_bitcast<unsigned, 4>(__y) << 23) + 0x3f80'0000))));
1779 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16
1780 && !__have_avx2)
1781 {
1782 const auto __lo = _mm_sll_epi64(__ix, __iy);
1783 const auto __hi
1784 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy));
1785 if constexpr (__have_sse4_1)
1786 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0));
1787 else
1788 return __vector_bitcast<_Up>(
1789 _mm_move_sd(__vector_bitcast<double>(__hi),
1790 __vector_bitcast<double>(__lo)));
1791 }
1792 else
1793 return __x << __y;
1794 }
1795#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
1796
1797 // }}}
1798 // _S_bit_shift_right {{{
1799#ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT
// Element-wise right shift of __xx by the single run-time count __y:
// arithmetic shift for signed, logical shift for unsigned element types.
// x86 has no 8-bit shift instructions and (before AVX-512) no 64-bit
// arithmetic shift, so those element sizes are emulated via wider shifts
// plus masking; all other cases use GCC's generic vector shift.
1800 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1801 inline _GLIBCXX_CONST static typename _TVT::type
1802 _S_bit_shift_right(_Tp __xx, int __y)
1803 {
1804 using _V = typename _TVT::type;
1805 using _Up = typename _TVT::value_type;
1806 _V __x = __xx;
1807 [[maybe_unused]] const auto __ix = __to_intrin(__x);
     // Intrinsics are unusable in constant evaluation; the generic
     // vector shift is constexpr-friendly.
1808 if (__builtin_is_constant_evaluated())
1809 return __x >> __y;
     // A known shift count >= the element width shifts out every bit of
     // an unsigned element; return the all-zero vector explicitly
     // instead of relying on the out-of-range shift.
1810 else if (__builtin_constant_p(__y)
1811 && is_unsigned_v<
1812 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__))
1813 return _V();
     // unsigned 8-bit: shift as 16-bit lanes, then mask off the bits
     // that leaked in from the neighboring (high) byte.
1814 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{
1815 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y)
1816 & _Up(0xff >> __y);
1817 //}}}
     // signed 8-bit: two 16-bit arithmetic shifts.  The high byte of
     // each 16-bit lane is shifted directly (count + 8, repositioned
     // with << 8); the low byte is first moved into the high half so
     // its own sign bit governs the arithmetic shift, then moved back.
1818 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{
1819 return __intrin_bitcast<_V>(
1820 (__vector_bitcast<_UShort>(__vector_bitcast<short>(__ix)
1821 >> (__y + 8))
1822 << 8)
1823 | (__vector_bitcast<_UShort>(
1824 __vector_bitcast<short>(__vector_bitcast<_UShort>(__ix) << 8)
1825 >> __y)
1826 >> 8));
1827 //}}}
1828 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected
     // signed 64-bit: no 64-bit arithmetic shift before AVX-512.
     // Combine 32-bit arithmetic shifts (for sign propagation) with
     // 64-bit logical shifts; split at __y > 32 so each partial shift
     // count stays in range for the 32-bit operations.
     // NOTE(review): the `>> 32` on the int vector relies on psrad's
     // saturating sign-fill behavior for a count equal to the lane
     // width — confirm this is the intended GCC lowering.
1829 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{
1830 {
1831 if (__y > 32)
1832 return (__intrin_bitcast<_V>(__vector_bitcast<int>(__ix) >> 32)
1833 & _Up(0xffff'ffff'0000'0000ull))
1834 | __vector_bitcast<_Up>(
1835 __vector_bitcast<int>(__vector_bitcast<_ULLong>(__ix)
1836 >> 32)
1837 >> (__y - 32));
1838 else
1839 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix)
1840 >> __y)
1841 | __vector_bitcast<_Up>(
1842 __vector_bitcast<int>(__ix & -0x8000'0000'0000'0000ll)
1843 >> __y);
1844 }
1845 //}}}
1846 else
1847 return __x >> __y;
1848 }
1849
// Element-wise right shift where every element carries its own shift
// count (__y is a vector): arithmetic for signed, logical for unsigned
// element types.  Dispatches on element size and on the best available
// ISA extension; cases GCC already lowers well fall through to the
// generic `__x >> __y` at the end.
1850 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1851 inline _GLIBCXX_CONST static typename _TVT::type
1852 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y)
1853 {
1854 using _V = typename _TVT::type;
1855 using _Up = typename _TVT::value_type;
1856 _V __x = __xx;
1857 [[maybe_unused]] const auto __ix = __to_intrin(__x);
1858 [[maybe_unused]] const auto __iy = __to_intrin(__y);
     // Constant folding and constant evaluation go through the generic
     // vector shift.
1859 if (__builtin_is_constant_evaluated()
1860 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
1861 return __x >> __y;
     // 8-bit elements: x86 has no byte-granularity shift instruction.
1862 else if constexpr (sizeof(_Up) == 1) //{{{
1863 {
          // AVX-512BW+VL: widen bytes to 16-bit lanes, do a variable
          // 16-bit shift, narrow back down.
1864 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl)
1865 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8(
1866 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix),
1867 _mm_cvtepi8_epi16(__iy))
1868 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix),
1869 _mm_cvtepu8_epi16(__iy))));
1870 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl)
1871 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8(
1872 is_signed_v<_Up>
1873 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix),
1874 _mm256_cvtepi8_epi16(__iy))
1875 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix),
1876 _mm256_cvtepu8_epi16(__iy))));
1877 else if constexpr (sizeof(__x) == 32 && __have_avx512bw)
1878 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8(
1879 is_signed_v<_Up>
1880 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix),
1881 _mm512_cvtepi8_epi16(__iy))
1882 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix),
1883 _mm512_cvtepu8_epi16(__iy))));
          // 64-byte vectors: no room to widen, so process odd and even
          // byte positions as 16-bit lanes and merge the two results
          // with a 0x5555... byte-position mask.
1884 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>)
1885 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1886 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1887 0x5555'5555'5555'5555ull,
1888 _mm512_srav_epi16(
1889 _mm512_slli_epi16(__ix, 8),
1890 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy,
1891 _mm512_set1_epi16(8)))));
1892 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>)
1893 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8(
1894 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)),
1895 0x5555'5555'5555'5555ull,
1896 _mm512_srlv_epi16(
1897 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix),
1898 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy))));
1899 /* This has better throughput but higher latency than the impl below
1900 else if constexpr (__have_avx2 && sizeof(__x) == 16 &&
1901 is_unsigned_v<_Up>)
1902 {
1903 const auto __shorts = __to_intrin(_S_bit_shift_right(
1904 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)),
1905 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy))));
1906 return __vector_bitcast<_Up>(
1907 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts)));
1908 }
1909 */
          // AVX2: four rounds of 32-bit variable shifts, one per byte
          // position inside each 32-bit lane.  Each round isolates its
          // result in the top byte (mask 0xff000000); the final OR
          // reassembles the four bytes.
1910 else if constexpr (__have_avx2 && sizeof(__x) > 8)
1911 // the following uses vpsr[al]vd, which requires AVX2
1912 if constexpr (is_signed_v<_Up>)
1913 {
1914 const auto r3 = __vector_bitcast<_UInt>(
1915 (__vector_bitcast<int>(__x)
1916 >> (__vector_bitcast<_UInt>(__y) >> 24)))
1917 & 0xff000000u;
1918 const auto r2
1919 = __vector_bitcast<_UInt>(
1920 ((__vector_bitcast<int>(__x) << 8)
1921 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)))
1922 & 0xff000000u;
1923 const auto r1
1924 = __vector_bitcast<_UInt>(
1925 ((__vector_bitcast<int>(__x) << 16)
1926 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)))
1927 & 0xff000000u;
1928 const auto r0 = __vector_bitcast<_UInt>(
1929 (__vector_bitcast<int>(__x) << 24)
1930 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24));
1931 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1932 | (r0 >> 24));
1933 }
1934 else
1935 {
1936 const auto r3 = (__vector_bitcast<_UInt>(__x)
1937 >> (__vector_bitcast<_UInt>(__y) >> 24))
1938 & 0xff000000u;
1939 const auto r2
1940 = ((__vector_bitcast<_UInt>(__x) << 8)
1941 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))
1942 & 0xff000000u;
1943 const auto r1
1944 = ((__vector_bitcast<_UInt>(__x) << 16)
1945 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))
1946 & 0xff000000u;
1947 const auto r0
1948 = (__vector_bitcast<_UInt>(__x) << 24)
1949 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24);
1950 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16)
1951 | (r0 >> 24));
1952 }
          // SSE4.1, unsigned: shift bit-by-bit (by 4, 2, 1) selecting
          // shifted vs. unshifted per element via pblendvb, keyed on
          // the count bits moved into each byte's sign-bit position
          // (the << 5 positions count bit 2 there, doubled each round).
1953 else if constexpr (__have_sse4_1
1954 && is_unsigned_v<_Up> && sizeof(__x) > 2)
1955 {
1956 auto __x128 = __vector_bitcast<_Up>(__ix)
1957 auto __mask
1958 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5);
1959 auto __x4 = __vector_bitcast<_Up>(
1960 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f));
1961 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1962 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4)));
1963 __mask += __mask;
1964 auto __x2 = __vector_bitcast<_Up>(
1965 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f));
1966 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1967 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2)));
1968 __mask += __mask;
1969 auto __x1 = __vector_bitcast<_Up>(
1970 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f));
1971 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin(
1972 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1)));
1973 return __intrin_bitcast<_V>(
1974 __x128
1975 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
1976 == 0)); // y > 7 nulls the result
1977 }
          // SSE4.1, signed: same per-bit scheme, but the high and low
          // bytes of each 16-bit lane are shifted separately (the low
          // byte first moved into the high half) so every byte's own
          // sign bit drives the arithmetic shift; __maskl re-aligns the
          // blend mask for the low-byte stream.
1978 else if constexpr (__have_sse4_1
1979 && is_signed_v<_Up> && sizeof(__x) > 2)
1980 {
1981 auto __mask = __vector_bitcast<_UChar>(
1982 __vector_bitcast<_UShort>(__iy) << 5);
1983 auto __maskl = [&]() {
1984 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
1985 };
1986 auto __xh = __vector_bitcast<short>(__ix);
1987 auto __xl = __vector_bitcast<short>(__ix) << 8;
1988 auto __xh4 = __xh >> 4;
1989 auto __xl4 = __xl >> 4;
1990 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
1991 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4)));
1992 __xl = __vector_bitcast<short>(
1993 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
1994 __to_intrin(__xl4)));
1995 __mask += __mask;
1996 auto __xh2 = __xh >> 2;
1997 auto __xl2 = __xl >> 2;
1998 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
1999 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2)));
2000 __xl = __vector_bitcast<short>(
2001 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2002 __to_intrin(__xl2)));
2003 __mask += __mask;
2004 auto __xh1 = __xh >> 1;
2005 auto __xl1 = __xl >> 1;
2006 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin(
2007 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1)));
2008 __xl = __vector_bitcast<short>(
2009 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl),
2010 __to_intrin(__xl1)));
2011 return __intrin_bitcast<_V>(
2012 (__vector_bitcast<_Up>((__xh & short(0xff00)))
2013 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2014 >> 8))
2015 & ((__vector_bitcast<_Up>(__iy) & char(0xf8))
2016 == 0)); // y > 7 nulls the result
2017 }
          // plain SSE2, unsigned: same per-bit scheme, but the blend is
          // done with a vector select on `__mask > 0x7f` instead of
          // pblendvb.
2018 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2
2019 {
2020 auto __mask
2021 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5);
2022 auto __x4 = __vector_bitcast<_Up>(
2023 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f));
2024 __x = __mask > 0x7f ? __x4 : __x;
2025 __mask += __mask;
2026 auto __x2 = __vector_bitcast<_Up>(
2027 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f));
2028 __x = __mask > 0x7f ? __x2 : __x;
2029 __mask += __mask;
2030 auto __x1 = __vector_bitcast<_Up>(
2031 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f));
2032 __x = __mask > 0x7f ? __x1 : __x;
2033 return __x
2034 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2035 }
          // plain SSE2, signed: high/low byte streams shifted as 16-bit
          // lanes (as in the SSE4.1 signed path) with vector-select
          // blends keyed on separate high/low masks.
2036 else if constexpr (sizeof(__x) > 2) // signed SSE2
2037 {
2038 static_assert(is_signed_v<_Up>);
2039 auto __maskh = __vector_bitcast<_UShort>(__y) << 5;
2040 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8);
2041 auto __xh = __vector_bitcast<short>(__x);
2042 auto __xl = __vector_bitcast<short>(__x) << 8;
2043 auto __xh4 = __xh >> 4;
2044 auto __xl4 = __xl >> 4;
2045 __xh = __maskh > 0x7fff ? __xh4 : __xh;
2046 __xl = __maskl > 0x7fff ? __xl4 : __xl;
2047 __maskh += __maskh;
2048 __maskl += __maskl;
2049 auto __xh2 = __xh >> 2;
2050 auto __xl2 = __xl >> 2;
2051 __xh = __maskh > 0x7fff ? __xh2 : __xh;
2052 __xl = __maskl > 0x7fff ? __xl2 : __xl;
2053 __maskh += __maskh;
2054 __maskl += __maskl;
2055 auto __xh1 = __xh >> 1;
2056 auto __xl1 = __xl >> 1;
2057 __xh = __maskh > 0x7fff ? __xh1 : __xh;
2058 __xl = __maskl > 0x7fff ? __xl1 : __xl;
2059 __x = __vector_bitcast<_Up>((__xh & short(0xff00)))
2060 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl)
2061 >> 8);
2062 return __x
2063 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result
2064 }
2065 else
2066 return __x >> __y;
2067 } //}}}
     // 16-bit elements.
2068 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
2069 {
          // Helper merging even (__a) and odd (__b) 16-bit lanes.
2070 [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) {
2071 if constexpr (sizeof(__a) == 16)
2072 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2073 0xaa);
2074 else if constexpr (sizeof(__a) == 32)
2075 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
2076 0xaa);
2077 else if constexpr (sizeof(__a) == 64)
2078 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
2079 __to_intrin(__b));
2080 else
2081 __assert_unreachable<decltype(__a)>();
2082 };
          // AVX-512BW provides native variable 16-bit shifts.
2083 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
2084 return __intrin_bitcast<_V>(is_signed_v<_Up>
2085 ? _mm_srav_epi16(__ix, __iy)
2086 : _mm_srlv_epi16(__ix, __iy));
2087 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32)
2088 return __vector_bitcast<_Up>(is_signed_v<_Up>
2089 ? _mm256_srav_epi16(__ix, __iy)
2090 : _mm256_srlv_epi16(__ix, __iy));
2091 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64)
2092 return __vector_bitcast<_Up>(is_signed_v<_Up>
2093 ? _mm512_srav_epi16(__ix, __iy)
2094 : _mm512_srlv_epi16(__ix, __iy));
          // AVX2: shift even and odd 16-bit lanes as 32-bit lanes and
          // re-merge with __blend_0xaa.
2095 else if constexpr (__have_avx2 && is_signed_v<_Up>)
2096 return __intrin_bitcast<_V>(
2097 __blend_0xaa(((__vector_bitcast<int>(__ix) << 16)
2098 >> (__vector_bitcast<int>(__iy) & 0xffffu))
2099 >> 16,
2100 __vector_bitcast<int>(__ix)
2101 >> (__vector_bitcast<int>(__iy) >> 16)));
2102 else if constexpr (__have_avx2 && is_unsigned_v<_Up>)
2103 return __intrin_bitcast<_V>(
2104 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu)
2105 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu),
2106 __vector_bitcast<_UInt>(__ix)
2107 >> (__vector_bitcast<_UInt>(__iy) >> 16)));
          // SSE4.1: per-bit shifts (8, 4, 2, 1) selected by pblendvb;
          // the count bits are first moved into each byte's sign-bit
          // position via the (<< 3) | (<< 11) expansion.
2108 else if constexpr (__have_sse4_1)
2109 {
2110 auto __mask = __vector_bitcast<_UShort>(__iy);
2111 auto __x128 = __vector_bitcast<_Up>(__ix);
2112 //__mask *= 0x0808;
2113 __mask = (__mask << 3) | (__mask << 11);
2114 // do __x128 = 0 where __y[4] is set
2115 __x128 = __vector_bitcast<_Up>(
2116 _mm_blendv_epi8(__to_intrin(__x128), __m128i(),
2117 __to_intrin(__mask)));
2118 // do __x128 =>> 8 where __y[3] is set
2119 __x128 = __vector_bitcast<_Up>(
2120 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8),
2121 __to_intrin(__mask += __mask)));
2122 // do __x128 =>> 4 where __y[2] is set
2123 __x128 = __vector_bitcast<_Up>(
2124 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4),
2125 __to_intrin(__mask += __mask)));
2126 // do __x128 =>> 2 where __y[1] is set
2127 __x128 = __vector_bitcast<_Up>(
2128 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2),
2129 __to_intrin(__mask += __mask)));
2130 // do __x128 =>> 1 where __y[0] is set
2131 return __intrin_bitcast<_V>(
2132 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1),
2133 __to_intrin(__mask + __mask)));
2134 }
          // plain SSE2: the same per-bit cascade using vector selects
          // keyed on the top bit of __k.
2135 else
2136 {
2137 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
2138 auto __x128 = __vector_bitcast<_Up>(__ix);
2139 auto __mask = [](__vector_type16_t<_UShort> __kk) {
2140 return __vector_bitcast<short>(__kk) < 0;
2141 };
2142 // do __x128 = 0 where __y[4] is set
2143 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
2144 // do __x128 =>> 8 where __y[3] is set
2145 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128;
2146 // do __x128 =>> 4 where __y[2] is set
2147 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128;
2148 // do __x128 =>> 2 where __y[1] is set
2149 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128;
2150 // do __x128 =>> 1 where __y[0] is set
2151 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1
2152 : __x128);
2153 }
2154 } //}}}
     // 32-bit elements without AVX2 (no vpsravd/vpsrlvd).
2155 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{
2156 {
2157 if constexpr (is_unsigned_v<_Up>)
2158 {
2159 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31
2160 const __m128 __factor_f = reinterpret_cast<__m128>(
2161 0x4f00'0000u - (__vector_bitcast<unsigned, 4>(__y) << 23));
2162 const __m128i __factor
2163 = __builtin_constant_p(__factor_f)
2164 ? __to_intrin(
2165 __make_vector<unsigned>(__factor_f[0], __factor_f[1],
2166 __factor_f[2], __factor_f[3]))
2167 : _mm_cvttps_epi32(__factor_f);
2168 const auto __r02
2169 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31);
2170 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4),
2171 _mm_srli_si128(__factor, 4));
2172 if constexpr (__have_sse4_1)
2173 return __intrin_bitcast<_V>(
2174 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33));
2175 else
2176 return __intrin_bitcast<_V>(
2177 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4));
2178 }
2179 else
2180 {
               // signed: four scalar-count shifts (one per element's
               // count moved into the low 64 bits) blended back
               // together.
2181 auto __shift = [](auto __a, auto __b) {
2182 if constexpr (is_signed_v<_Up>)
2183 return _mm_sra_epi32(__a, __b);
2184 else
2185 return _mm_srl_epi32(__a, __b);
2186 };
2187 const auto __r0
2188 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i()));
2189 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32));
2190 const auto __r2
2191 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i()));
2192 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12));
2193 if constexpr (__have_sse4_1)
2194 return __intrin_bitcast<_V>(
2195 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3),
2196 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0));
2197 else
2198 return __intrin_bitcast<_V>(_mm_unpacklo_epi64(
2199 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)),
2200 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4))));
2201 }
2202 } //}}}
2203 else
2204 return __x >> __y;
2205 }
2206#endif // _GLIBCXX_SIMD_NO_SHIFT_OPT
2207
2208 // }}}
2209 // compares {{{
2210 // _S_equal_to {{{
// Element-wise __x == __y, returning the ABI's mask representation.
// AVX-512 ABIs compare directly into a bit mask, restricted by the
// implicit element mask __k1; 8-byte vectors are widened to 16 bytes so
// the SSE compare instructions apply; everything else defers to the
// generic _Base implementation.
2211 template <typename _Tp, size_t _Np>
2212 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2213 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2214 {
2215 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2216 {
               // Constant evaluation / constant-propagated operands:
               // use the generic vector compare and convert to bits.
2217 if (__builtin_is_constant_evaluated()
2218 || (__x._M_is_constprop() && __y._M_is_constprop()))
2219 return _MaskImpl::_S_to_bits(
2220 __as_wrapper<_Np>(__x._M_data == __y._M_data));
2221
2222 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2223 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2224 [[maybe_unused]] const auto __yi = __to_intrin(__y);
               // Floating point uses _CMP_EQ_OQ (equal, ordered,
               // quiet): false for NaN operands, no exception raised
               // on quiet NaN.
2225 if constexpr (is_floating_point_v<_Tp>)
2226 {
2227 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2228 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2229 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2230 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2231 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2232 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2233 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2234 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2235 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2236 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2237 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2238 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ);
2239 else
2240 __assert_unreachable<_Tp>();
2241 }
               // Integers: dispatch on register width x element size.
2242 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2243 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2244 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2245 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2246 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2247 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2248 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2249 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2250 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2251 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2252 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2253 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2254 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2255 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2256 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2257 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2258 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2259 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2260 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2261 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2263 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2265 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2266 else
2267 __assert_unreachable<_Tp>();
2268 } // }}}
2269 else if (__builtin_is_constant_evaluated())
2270 return _Base::_S_equal_to(__x, __y);
          // 8-byte vectors: compare as 16-byte vectors, then keep only
          // the low 8 bytes of the result.
2271 else if constexpr (sizeof(__x) == 8) // {{{
2272 {
2273 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2274 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2275 _MaskMember<_Tp> __r64;
2276 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2277 return __r64;
2278 } // }}}
2279 else
2280 return _Base::_S_equal_to(__x, __y);
2281 }
2282
2283 // }}}
2284 // _S_not_equal_to {{{
// Element-wise __x != __y, returning the ABI's mask representation.
// Mirrors _S_equal_to: AVX-512 compares straight into a bit mask,
// 8-byte vectors are widened to 16 bytes, everything else defers to
// the generic _Base implementation.
2285 template <typename _Tp, size_t _Np>
2286 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2287 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2288 {
2289 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2290 {
2291 if (__builtin_is_constant_evaluated()
2292 || (__x._M_is_constprop() && __y._M_is_constprop()))
2293 return _MaskImpl::_S_to_bits(
2294 __as_wrapper<_Np>(__x._M_data != __y._M_data));
2295
2296 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2297 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2298 [[maybe_unused]] const auto __yi = __to_intrin(__y);
               // Floating point uses _CMP_NEQ_UQ (not-equal, unordered,
               // quiet): true when either operand is NaN.
2299 if constexpr (is_floating_point_v<_Tp>)
2300 {
2301 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2302 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ)
2303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2304 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2306 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2308 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2309 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2310 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2311 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2312 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ);
2313 else
2314 __assert_unreachable<_Tp>();
2315 }
               // Integers: invert the masked cmpeq result.  The NOT
               // also sets mask bits outside __k1 to 1 —
               // NOTE(review): assumes bits beyond the implicit mask
               // are treated as don't-care by all consumers; confirm.
2316 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
2317 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2318 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
2319 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2320 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2)
2321 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2322 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1)
2323 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2324 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
2325 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2326 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
2327 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2328 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2)
2329 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2330 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1)
2331 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2332 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
2333 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi);
2334 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
2335 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi);
2336 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2)
2337 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi);
2338 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1)
2339 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi);
2340 else
2341 __assert_unreachable<_Tp>();
2342 } // }}}
          // 8-byte vectors: compare as 16-byte vectors, keep only the
          // low 8 bytes of the result.
2343 else if constexpr (!__builtin_is_constant_evaluated() // {{{
2344 && sizeof(__x) == 8)
2345 {
2346 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2347 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2348 _MaskMember<_Tp> __r64;
2349 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2350 return __r64;
2351 } // }}}
2352 else
2353 return _Base::_S_not_equal_to(__x, __y);
2354 }
2355
2356 // }}}
2357 // _S_less {{{
// Element-wise __x < __y, returning the ABI's mask representation.
// AVX-512 ABIs dispatch on register width, then on element type, to
// the matching masked compare intrinsic (signed/unsigned integer
// variants differ; float/double use _CMP_LT_OS: less-than, ordered,
// signaling — false for NaN operands).  8-byte vectors are widened to
// 16 bytes; everything else defers to the generic _Base implementation.
2358 template <typename _Tp, size_t _Np>
2359 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2360 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2361 {
2362 if constexpr (__is_avx512_abi<_Abi>()) // {{{
2363 {
               // Constant evaluation / constant-propagated operands:
               // generic vector compare, converted to bits.
2364 if (__builtin_is_constant_evaluated()
2365 || (__x._M_is_constprop() && __y._M_is_constprop()))
2366 return _MaskImpl::_S_to_bits(
2367 __as_wrapper<_Np>(__x._M_data < __y._M_data));
2368
2369 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
2370 [[maybe_unused]] const auto __xi = __to_intrin(__x);
2371 [[maybe_unused]] const auto __yi = __to_intrin(__y);
2372 if constexpr (sizeof(__xi) == 64)
2373 {
2374 if constexpr (is_same_v<_Tp, float>)
2375 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2376 else if constexpr (is_same_v<_Tp, double>)
2377 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2378 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2379 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2380 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2381 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2382 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2383 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2384 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2385 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2386 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2387 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2388 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2389 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2390 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2391 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2392 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2393 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2394 else
2395 __assert_unreachable<_Tp>();
2396 }
2397 else if constexpr (sizeof(__xi) == 32)
2398 {
2399 if constexpr (is_same_v<_Tp, float>)
2400 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2401 else if constexpr (is_same_v<_Tp, double>)
2402 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2403 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2404 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2405 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2406 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2407 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2408 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2409 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2410 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2411 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2412 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2413 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2414 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2415 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2416 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2417 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2418 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2419 else
2420 __assert_unreachable<_Tp>();
2421 }
2422 else if constexpr (sizeof(__xi) == 16)
2423 {
2424 if constexpr (is_same_v<_Tp, float>)
2425 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS);
2426 else if constexpr (is_same_v<_Tp, double>)
2427 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS);
2428 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
2429 return _mm_mask_cmplt_epi8_mask(__k1, __xi, __yi);
2430 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
2431 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi);
2432 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
2433 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi);
2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
2435 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi);
2436 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
2437 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi);
2438 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
2439 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi);
2440 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
2441 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi);
2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
2443 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi);
2444 else
2445 __assert_unreachable<_Tp>();
2446 }
2447 else
2448 __assert_unreachable<_Tp>();
2449 } // }}}
          // 8-byte vectors: compare as 16-byte vectors, keep only the
          // low 8 bytes of the result.
2450 else if constexpr (!__builtin_is_constant_evaluated() // {{{
2451 && sizeof(__x) == 8)
2452 {
2453 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
2454 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
2455 _MaskMember<_Tp> __r64;
2456 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
2457 return __r64;
2458 } // }}}
2459 else
2460 return _Base::_S_less(__x, __y);
2461 }
2462
2463 // }}}
2464 // _S_less_equal {{{
    // Element-wise __x <= __y, returned in the ABI's mask representation.
    // AVX512 ABIs produce a bitmask via masked compare intrinsics; 8-byte
    // vectors are widened to 128 bits; everything else defers to _Base.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
	if constexpr (__is_avx512_abi<_Abi>()) // {{{
	  {
	    // In constant evaluation (or when both operands constant-fold)
	    // the generic vector compare is usable and folds completely.
	    if (__builtin_is_constant_evaluated()
		  || (__x._M_is_constprop() && __y._M_is_constprop()))
	      return _MaskImpl::_S_to_bits(
		       __as_wrapper<_Np>(__x._M_data <= __y._M_data));

	    // __k1 limits the compare to the ABI's active elements, so
	    // padding lanes never set mask bits.
	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    [[maybe_unused]] const auto __xi = __to_intrin(__x);
	    [[maybe_unused]] const auto __yi = __to_intrin(__y);
	    if constexpr (sizeof(__xi) == 64)
	      {
		// _CMP_LE_OS: ordered, signaling — matches operator<= on
		// floating-point (raises invalid on quiet NaN).
		if constexpr (is_same_v<_Tp, float>)
		  return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
		else if constexpr (is_same_v<_Tp, double>)
		  return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
		  return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
		  return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
		  return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
		  return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
		  return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
		  return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
		  return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
		  return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi);
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr (sizeof(__xi) == 32)
	      {
		if constexpr (is_same_v<_Tp, float>)
		  return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
		else if constexpr (is_same_v<_Tp, double>)
		  return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
		  return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
		  return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
		  return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
		  return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
		  return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
		  return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
		  return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
		  return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi);
		else
		  __assert_unreachable<_Tp>();
	      }
	    else if constexpr (sizeof(__xi) == 16)
	      {
		if constexpr (is_same_v<_Tp, float>)
		  return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS);
		else if constexpr (is_same_v<_Tp, double>)
		  return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1)
		  return _mm_mask_cmple_epi8_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2)
		  return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4)
		  return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi);
		else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8)
		  return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1)
		  return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2)
		  return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4)
		  return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi);
		else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8)
		  return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi);
		else
		  __assert_unreachable<_Tp>();
	      }
	    else
	      __assert_unreachable<_Tp>();
	  } // }}}
	else if constexpr (!__builtin_is_constant_evaluated() // {{{
			     && sizeof(__x) == 8)
	  {
	    // 8-byte vectors: widen to 128 bits, compare there, and copy the
	    // low 8 bytes of the result back (the upper half is garbage).
	    const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x)
				  <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y);
	    _MaskMember<_Tp> __r64;
	    __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64));
	    return __r64;
	  } // }}}
	else
	  return _Base::_S_less_equal(__x, __y);
      }
2569
2570 // }}} }}}
2571 // negation {{{
    // Logical negation (operator! on a simd): true where the element is zero.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
      {
	if constexpr (__is_avx512_abi<_Abi>())
	  // AVX512 masks are bitmasks; compare against a zero vector directly.
	  return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>());
	else
	  return _Base::_S_negate(__x);
      }
2581
2582 // }}}
2583 // math {{{
2584 using _Base::_S_abs;
2585
2586 // _S_sqrt {{{
    // Element-wise square root, dispatching to the widest matching
    // SSE/AVX/AVX512 sqrt intrinsic.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_sqrt(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (__is_sse_ps<_Tp, _Np>())
	  // __auto_bitcast because _Np may be smaller than a full __m128.
	  return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x)));
	else if constexpr (__is_sse_pd<_Tp, _Np>())
	  return _mm_sqrt_pd(__x);
	else if constexpr (__is_avx_ps<_Tp, _Np>())
	  return _mm256_sqrt_ps(__x);
	else if constexpr (__is_avx_pd<_Tp, _Np>())
	  return _mm256_sqrt_pd(__x);
	else if constexpr (__is_avx512_ps<_Tp, _Np>())
	  return _mm512_sqrt_ps(__x);
	else if constexpr (__is_avx512_pd<_Tp, _Np>())
	  return _mm512_sqrt_pd(__x);
	else
	  __assert_unreachable<_Tp>();
      }
2606
2607 // }}}
2608 // _S_ldexp {{{
2609 template <typename _Tp, size_t _Np>
2610 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2611 _S_ldexp(_SimdWrapper<_Tp, _Np> __x,
2612 __fixed_size_storage_t<int, _Np> __exp)
2613 {
2614 if constexpr (sizeof(__x) == 64 || __have_avx512vl)
2615 {
2616 const auto __xi = __to_intrin(__x);
2617 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>, _Tp, _Abi>
2618 __cvt;
2619 const auto __expi = __to_intrin(__cvt(__exp));
2620 using _Up = __bool_storage_member_type_t<_Np>;
2621 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up();
2622 if constexpr (sizeof(__xi) == 16)
2623 {
2624 if constexpr (sizeof(_Tp) == 8)
2625 return _mm_maskz_scalef_pd(__k1, __xi, __expi);
2626 else
2627 return _mm_maskz_scalef_ps(__k1, __xi, __expi);
2628 }
2629 else if constexpr (sizeof(__xi) == 32)
2630 {
2631 if constexpr (sizeof(_Tp) == 8)
2632 return _mm256_maskz_scalef_pd(__k1, __xi, __expi);
2633 else
2634 return _mm256_maskz_scalef_ps(__k1, __xi, __expi);
2635 }
2636 else
2637 {
2638 static_assert(sizeof(__xi) == 64);
2639 if constexpr (sizeof(_Tp) == 8)
2640 return _mm512_maskz_scalef_pd(__k1, __xi, __expi);
2641 else
2642 return _mm512_maskz_scalef_ps(__k1, __xi, __expi);
2643 }
2644 }
2645 else
2646 return _Base::_S_ldexp(__x, __exp);
2647 }
2648
2649 // }}}
2650 // _S_trunc {{{
    // Element-wise std::trunc (round toward zero).
    // Immediate 0x0b = _MM_FROUND_TO_ZERO (0x3) | suppress exceptions (0x8).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_trunc(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (__is_avx512_ps<_Tp, _Np>())
	  return _mm512_roundscale_ps(__x, 0x0b);
	else if constexpr (__is_avx512_pd<_Tp, _Np>())
	  return _mm512_roundscale_pd(__x, 0x0b);
	else if constexpr (__is_avx_ps<_Tp, _Np>())
	  return _mm256_round_ps(__x, 0xb);
	else if constexpr (__is_avx_pd<_Tp, _Np>())
	  return _mm256_round_pd(__x, 0xb);
	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb));
	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
	  return _mm_round_pd(__x, 0xb);
	else if constexpr (__is_sse_ps<_Tp, _Np>())
	  {
	    // Pre-SSE4.1: truncate via float->int->float round trip. That
	    // only works for |x| < 2^31, so keep the original value where
	    // the exponent already guarantees an integral value.
	    auto __truncated
	      = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)));
	    const auto __no_fractional_values
	      = __vector_bitcast<int>(__vector_bitcast<_UInt>(__to_intrin(__x))
				      & 0x7f800000u)
		< 0x4b000000; // the exponent is so large that no mantissa bits
			      // signify fractional values (0x3f8 + 23*8 =
			      // 0x4b0)
	    return __no_fractional_values ? __truncated : __to_intrin(__x);
	  }
	else
	  return _Base::_S_trunc(__x);
      }
2682
2683 // }}}
2684 // _S_round {{{
    // Element-wise std::round (round to nearest, ties away from zero).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
	// Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away
	// from zero as required by std::round. Therefore this function is more
	// complicated: truncate first, then add +-1 where the discarded
	// fraction is >= .5.
	using _V = __vector_type_t<_Tp, _Np>;
	_V __truncated;
	if constexpr (__is_avx512_ps<_Tp, _Np>())
	  __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b);
	else if constexpr (__is_avx512_pd<_Tp, _Np>())
	  __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b);
	else if constexpr (__is_avx_ps<_Tp, _Np>())
	  __truncated = _mm256_round_ps(__x._M_data,
					_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
	else if constexpr (__is_avx_pd<_Tp, _Np>())
	  __truncated = _mm256_round_pd(__x._M_data,
					_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
	  __truncated = __auto_bitcast(
	    _mm_round_ps(__to_intrin(__x),
			 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
	  __truncated
	    = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
	else if constexpr (__is_sse_ps<_Tp, _Np>())
	  __truncated = __auto_bitcast(
	    _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))));
	else
	  return _Base::_S_round(__x);

	// x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0
	// x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0

	// Where |x - truncated| >= .5 add copysign(1, x); _S_signmask
	// preserves the sign so -.5 rounds to -1.
	const _V __rounded
	  = __truncated
	    + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5)
		 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1)
		 : _V());
	if constexpr (__have_sse4_1)
	  return __rounded;
	else // adjust for missing range in cvttps_epi32 (|x| >= 2^23 is
	     // already integral, and may overflow the int32 conversion)
	  return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded
							      : __x._M_data;
      }
2731
2732 // }}}
2733 // _S_nearbyint {{{
    // Element-wise std::nearbyint: round using the current rounding mode
    // without raising the inexact exception (hence _MM_FROUND_NO_EXC;
    // 0x0c = current direction (0x4) | suppress exceptions (0x8)).
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_nearbyint(_Tp __x) noexcept
      {
	if constexpr (_TVT::template _S_is<float, 16>)
	  return _mm512_roundscale_ps(__x, 0x0c);
	else if constexpr (_TVT::template _S_is<double, 8>)
	  return _mm512_roundscale_pd(__x, 0x0c);
	else if constexpr (_TVT::template _S_is<float, 8>)
	  return _mm256_round_ps(__x,
				 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
	else if constexpr (_TVT::template _S_is<double, 4>)
	  return _mm256_round_pd(__x,
				 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
	else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
	  return _mm_round_ps(__x,
			      _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
	else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
	  return _mm_round_pd(__x,
			      _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
	else
	  return _Base::_S_nearbyint(__x);
      }
2756
2757 // }}}
2758 // _S_rint {{{
    // Element-wise std::rint: round using the current rounding mode. Unlike
    // _S_nearbyint this may raise the inexact exception (no _MM_FROUND_NO_EXC;
    // 0x04 = current direction only).
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_rint(_Tp __x) noexcept
      {
	if constexpr (_TVT::template _S_is<float, 16>)
	  return _mm512_roundscale_ps(__x, 0x04);
	else if constexpr (_TVT::template _S_is<double, 8>)
	  return _mm512_roundscale_pd(__x, 0x04);
	else if constexpr (_TVT::template _S_is<float, 8>)
	  return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION);
	else if constexpr (_TVT::template _S_is<double, 4>)
	  return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
	else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>)
	  return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION)
;
	else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>)
	  return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION);
	else
	  return _Base::_S_rint(__x);
      }
2777
2778 // }}}
2779 // _S_floor {{{
    // Element-wise std::floor (round toward negative infinity).
    // Immediate 0x09 = round down (0x1) | suppress exceptions (0x8).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_floor(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (__is_avx512_ps<_Tp, _Np>())
	  return _mm512_roundscale_ps(__x, 0x09);
	else if constexpr (__is_avx512_pd<_Tp, _Np>())
	  return _mm512_roundscale_pd(__x, 0x09);
	else if constexpr (__is_avx_ps<_Tp, _Np>())
	  return _mm256_round_ps(__x, 0x9);
	else if constexpr (__is_avx_pd<_Tp, _Np>())
	  return _mm256_round_pd(__x, 0x9);
	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9));
	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
	  return _mm_round_pd(__x, 0x9);
	else
	  return _Base::_S_floor(__x);
      }
2799
2800 // }}}
2801 // _S_ceil {{{
    // Element-wise std::ceil (round toward positive infinity).
    // Immediate 0x0a = round up (0x2) | suppress exceptions (0x8).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_ceil(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (__is_avx512_ps<_Tp, _Np>())
	  return _mm512_roundscale_ps(__x, 0x0a);
	else if constexpr (__is_avx512_pd<_Tp, _Np>())
	  return _mm512_roundscale_pd(__x, 0x0a);
	else if constexpr (__is_avx_ps<_Tp, _Np>())
	  return _mm256_round_ps(__x, 0xa);
	else if constexpr (__is_avx_pd<_Tp, _Np>())
	  return _mm256_round_pd(__x, 0xa);
	else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>())
	  return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa));
	else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>())
	  return _mm_round_pd(__x, 0xa);
	else
	  return _Base::_S_ceil(__x);
      }
2821
2822 // }}}
2823 // _S_signbit {{{
    // Element-wise std::signbit: true where the sign bit is set (including
    // -0. and negative NaNs).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_signbit(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
	  {
	    // AVX512DQ: vpmovd2m/vpmovq2m extract the MSB of each element
	    // directly into a mask register.
	    if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4)
	      return _mm512_movepi32_mask(
		       __intrin_bitcast<__m512i>(__x._M_data));
	    else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8)
	      return _mm512_movepi64_mask(
		       __intrin_bitcast<__m512i>(__x._M_data));
	    else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4)
	      return _mm256_movepi32_mask(
		       __intrin_bitcast<__m256i>(__x._M_data));
	    else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8)
	      return _mm256_movepi64_mask(
		       __intrin_bitcast<__m256i>(__x._M_data));
	    else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4)
	      return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data));
	    else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8)
	      return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data));
	  }
	else if constexpr (__is_avx512_abi<_Abi>())
	  {
	    // AVX512F only: <= 32 bytes can use the legacy movemask to get
	    // the sign bits; 64 bytes need a signed compare against zero.
	    const auto __xi = __to_intrin(__x);
	    [[maybe_unused]] constexpr auto __k1
	      = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_movemask_ps(__xi);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_movemask_pd(__xi);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_movemask_ps(__xi);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_movemask_pd(__xi);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      // Signed int < 0 iff the sign bit is set.
	      return _mm512_mask_cmplt_epi32_mask(
		       __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_mask_cmplt_epi64_mask(
		       __k1, __intrin_bitcast<__m512i>(__xi), __m512i());
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  return _Base::_S_signbit(__x);
	/*{
	  using _I = __int_for_sizeof_t<_Tp>;
	  if constexpr (sizeof(__x) == 64)
	    return _S_less(__vector_bitcast<_I>(__x), _I());
	  else
	    {
	      const auto __xx = __vector_bitcast<_I>(__x._M_data);
	      [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>;
	      if constexpr ((sizeof(_Tp) == 4 &&
			     (__have_avx2 || sizeof(__x) == 16)) ||
			    __have_avx512vl)
		{
		  return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>);
		}
	      else if constexpr ((__have_avx2 ||
				  (__have_ssse3 && sizeof(__x) == 16)))
		{
		  return __vector_bitcast<_Tp>((__xx & __signmask) ==
					       __signmask);
		}
	      else
		{ // SSE2/3 or AVX (w/o AVX2)
		  constexpr auto __one = __vector_broadcast<_Np, _Tp>(1);
		  return __vector_bitcast<_Tp>(
		    __vector_bitcast<_Tp>(
		      (__xx & __signmask) |
		      __vector_bitcast<_I>(__one)) // -1 or 1
		    != __one);
		}
	    }
	}*/
      }
2903
2904 // }}}
2905 // _S_isnonzerovalue_mask {{{
2906 // (isnormal | is subnormal == !isinf & !isnan & !is zero)
    // (isnormal | is subnormal == !isinf & !isnan & !is zero)
    // Returns an AVX512 bitmask of elements that are finite and nonzero.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static auto _S_isnonzerovalue_mask(_Tp __x)
      {
	using _Traits = _VectorTraits<_Tp>;
	if constexpr (__have_avx512dq_vl)
	  {
	    // fpclass imm 0x9f = SNaN|neg-inf|pos-inf|-0|+0|QNaN; negate to
	    // get "finite and nonzero".
	    if constexpr (_Traits::template _S_is<
			    float, 2> || _Traits::template _S_is<float, 4>)
	      return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f));
	    else if constexpr (_Traits::template _S_is<float, 8>)
	      return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f));
	    else if constexpr (_Traits::template _S_is<float, 16>)
	      return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f));
	    else if constexpr (_Traits::template _S_is<double, 2>)
	      return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f));
	    else if constexpr (_Traits::template _S_is<double, 4>)
	      return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f));
	    else if constexpr (_Traits::template _S_is<double, 8>)
	      return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f));
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  {
	    // Without AVX512DQ: x*inf is NaN iff x == 0, x*0 is NaN iff
	    // x is inf or NaN. Both ordered (non-NaN) iff x is finite and
	    // nonzero.
	    using _Up = typename _Traits::value_type;
	    constexpr size_t _Np = _Traits::_S_full_size;
	    const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0
	    const auto __b = __x * _Up(); // NaN if __x == inf
	    if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>())
	      return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b),
				     _CMP_ORD_Q);
	    else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>())
	      // Widen to 512 bits; keep only the 4 valid lanes.
	      return __mmask8(0xf
				& _mm512_cmp_ps_mask(__auto_bitcast(__a),
						     __auto_bitcast(__b),
						     _CMP_ORD_Q));
	    else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>())
	      return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
	    else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>())
	      return __mmask8(0x3
				& _mm512_cmp_pd_mask(__auto_bitcast(__a),
						     __auto_bitcast(__b),
						     _CMP_ORD_Q));
	    else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>())
	      return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
	    else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>())
	      return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a),
						 __auto_bitcast(__b),
						 _CMP_ORD_Q));
	    else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>())
	      return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
	    else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>())
	      return __mmask8(0xf
				& _mm512_cmp_pd_mask(__auto_bitcast(__a),
						     __auto_bitcast(__b),
						     _CMP_ORD_Q));
	    else if constexpr (__is_avx512_ps<_Up, _Np>())
	      return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q);
	    else if constexpr (__is_avx512_pd<_Up, _Np>())
	      return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q);
	    else
	      __assert_unreachable<_Tp>();
	  }
      }
2971
2972 // }}}
2973 // _S_isfinite {{{
    // Element-wise std::isfinite: true where the element is neither inf nor
    // NaN. With -ffinite-math-only everything is assumed finite (generic
    // _Base handles that case).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_isfinite(_SimdWrapper<_Tp, _Np> __x)
      {
	static_assert(is_floating_point_v<_Tp>);
#if !__FINITE_MATH_ONLY__
	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
	  {
	    // fpclass imm 0x99 = SNaN|neg-inf|pos-inf|QNaN; XOR with the
	    // implicit mask inverts it over the active elements only.
	    const auto __xi = __to_intrin(__x);
	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99);
	  }
	else if constexpr (__is_avx512_abi<_Abi>())
	  {
	    // if all exponent bits are set, __x is either inf or NaN
	    using _I = __int_for_sizeof_t<_Tp>;
	    const auto __inf = __vector_bitcast<_I>(
	      __vector_broadcast<_Np>(__infinity_v<_Tp>));
	    return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf);
	  }
	else
#endif
	  return _Base::_S_isfinite(__x);
      }
3009
3010 // }}}
3011 // _S_isinf {{{
    // Element-wise std::isinf. fpclass imm 0x18 = neg-inf (0x10) | pos-inf
    // (0x08). With -ffinite-math-only the generic _Base impl applies.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_isinf(_SimdWrapper<_Tp, _Np> __x)
      {
#if !__FINITE_MATH_ONLY__
	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
	  {
	    const auto __xi = __to_intrin(__x);
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return _mm512_fpclass_ps_mask(__xi, 0x18);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_fpclass_pd_mask(__xi, 0x18);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_fpclass_ps_mask(__xi, 0x18);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_fpclass_pd_mask(__xi, 0x18);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_fpclass_ps_mask(__xi, 0x18);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_fpclass_pd_mask(__xi, 0x18);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_avx512dq_vl)
	  {
	    // Non-AVX512 ABI but AVX512DQ+VL available: classify into a mask
	    // register, then expand the bitmask back into a vector mask.
	    if constexpr (__is_sse_pd<_Tp, _Np>())
	      return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18));
	    else if constexpr (__is_avx_pd<_Tp, _Np>())
	      return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18));
	    else if constexpr (__is_sse_ps<_Tp, _Np>())
	      return _mm_movm_epi32(
		       _mm_fpclass_ps_mask(__to_intrin(__x), 0x18));
	    else if constexpr (__is_avx_ps<_Tp, _Np>())
	      return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18));
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
#endif
	  return _Base::_S_isinf(__x);
      }
3053
3054 // }}}
3055 // _S_isnormal {{{
    // Element-wise std::isnormal: finite, nonzero, and not subnormal.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
      {
	// fpclass immediate classifying everything that is NOT normal:
	// 0x26 = denormal|-0|+0 (inf/NaN cannot occur with finite math);
	// 0xbf additionally includes SNaN|QNaN|neg-inf|pos-inf.
#if __FINITE_MATH_ONLY__
	[[maybe_unused]] constexpr int __mode = 0x26;
#else
	[[maybe_unused]] constexpr int __mode = 0xbf;
#endif
	if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq)
	  {
	    // XOR with the implicit mask inverts "not normal" into "normal"
	    // over the active elements only.
	    const auto __xi = __to_intrin(__x);
	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_avx512dq)
	  {
	    // Vector-mask ABI: classify into a bitmask, invert, and expand
	    // back into a vector mask (movm).
	    if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>())
	      return _mm_movm_epi32(
		       _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode)));
	    else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>())
	      return _mm256_movm_epi32(
		       _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode)));
	    else if constexpr (__is_avx512_ps<_Tp, _Np>())
	      return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode));
	    else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>())
	      return _mm_movm_epi64(
		       _knot_mask8(_mm_fpclass_pd_mask(__x, __mode)));
	    else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>())
	      return _mm256_movm_epi64(
		       _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode)));
	    else if constexpr (__is_avx512_pd<_Tp, _Np>())
	      return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode));
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__is_avx512_abi<_Abi>())
	  {
	    // AVX512F only: normal iff norm_min <= |x| (< inf).
	    using _I = __int_for_sizeof_t<_Tp>;
	    const auto absn = __vector_bitcast<_I>(_S_abs(__x));
	    const auto minn = __vector_bitcast<_I>(
	      __vector_broadcast<_Np>(__norm_min_v<_Tp>));
#if __FINITE_MATH_ONLY__
	    return _S_less_equal<_I, _Np>(minn, absn);
#else
	    const auto infn
	      = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
	    return __and(_S_less_equal<_I, _Np>(minn, absn),
			 _S_less<_I, _Np>(absn, infn));
#endif
	  }
	else
	  return _Base::_S_isnormal(__x);
      }
3123
3124 // }}}
3125 // _S_isnan {{{
    // Element-wise std::isnan: a value is NaN iff it compares unordered
    // with itself.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_isnan(_SimdWrapper<_Tp, _Np> __x)
      { return _S_isunordered(__x, __x); }
3130
3131 // }}}
3132 // _S_isunordered {{{
    // Element-wise std::isunordered: true where __x or __y is NaN.
    // _CMP_UNORD_Q is the quiet predicate (no invalid exception on QNaN).
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x,
		     [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y)
      {
#if __FINITE_MATH_ONLY__
	return {}; // false
#else
	const auto __xi = __to_intrin(__x);
	const auto __yi = __to_intrin(__y);
	if constexpr (__is_avx512_abi<_Abi>())
	  {
	    constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q);
	  }
	else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	  return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q));
	else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	  return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q));
	else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	  return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi));
	else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	  return __to_masktype(_mm_cmpunord_pd(__xi, __yi));
	else
	  __assert_unreachable<_Tp>();
#endif
      }
3171
3172 // }}}
3173 // _S_isgreater {{{
    // Element-wise std::isgreater: __x > __y without raising FP exceptions
    // on quiet NaN (hence the quiet predicate _CMP_GT_OQ).
    template <typename _Tp, size_t _Np>
      static constexpr _MaskMember<_Tp> _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
						     _SimdWrapper<_Tp, _Np> __y)
      {
	const auto __xi = __to_intrin(__x);
	const auto __yi = __to_intrin(__y);
	if constexpr (__is_avx512_abi<_Abi>())
	  {
	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_avx)
	  {
	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ));
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ));
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_sse2 && sizeof(__xi) == 16
			     && sizeof(_Tp) == 4)
	  {
	    // SSE2 has no quiet compare. Map the sign-magnitude float bits
	    // to two's-complement order (negate the magnitude of negative
	    // values), compare as ints, and AND with cmpord to drop NaNs.
	    const auto __xn = __vector_bitcast<int>(__xi);
	    const auto __yn = __vector_bitcast<int>(__yi);
	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
	    const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
	    return __auto_bitcast(
	      __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp));
	  }
	else if constexpr (__have_sse2 && sizeof(__xi) == 16
			     && sizeof(_Tp) == 8)
	  // ucomigt is a quiet scalar compare returning 0/1; negation
	  // produces the 0/-1 element mask, once per 64-bit lane.
	  return __vector_type_t<__int_with_sizeof_t<8>, 2>{
	    -_mm_ucomigt_sd(__xi, __yi),
	    -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi),
			    _mm_unpackhi_pd(__yi, __yi))};
	else
	  return _Base::_S_isgreater(__x, __y);
      }
3230
3231 // }}}
3232 // _S_isgreaterequal {{{
    // Element-wise std::isgreaterequal: __x >= __y without raising FP
    // exceptions on quiet NaN (quiet predicate _CMP_GE_OQ). Structure
    // mirrors _S_isgreater.
    template <typename _Tp, size_t _Np>
      static constexpr _MaskMember<_Tp>
      _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
	const auto __xi = __to_intrin(__x);
	const auto __yi = __to_intrin(__y);
	if constexpr (__is_avx512_abi<_Abi>())
	  {
	    const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
	    if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
	      return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
	      return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_avx)
	  {
	    if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
	      return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ));
	    else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
	      return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ));
	    else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
	      return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ));
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (__have_sse2 && sizeof(__xi) == 16
			     && sizeof(_Tp) == 4)
	  {
	    // Sign-magnitude -> two's-complement mapping; see _S_isgreater.
	    const auto __xn = __vector_bitcast<int>(__xi);
	    const auto __yn = __vector_bitcast<int>(__yi);
	    const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
	    const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
	    return __auto_bitcast(
	      __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp));
	  }
	else if constexpr (__have_sse2 && sizeof(__xi) == 16
			     && sizeof(_Tp) == 8)
	  // Quiet scalar compare per 64-bit lane; see _S_isgreater.
	  return __vector_type_t<__int_with_sizeof_t<8>, 2>{
	    -_mm_ucomige_sd(__xi, __yi),
	    -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi),
			    _mm_unpackhi_pd(__yi, __yi))};
	else
	  return _Base::_S_isgreaterequal(__x, __y);
      }
3289
3290 // }}}
3291 // _S_isless {{{
3292 template <typename _Tp, size_t _Np>
3293 static constexpr _MaskMember<_Tp> _S_isless(_SimdWrapper<_Tp, _Np> __x,
3294 _SimdWrapper<_Tp, _Np> __y)
3295 {
3296 const auto __xi = __to_intrin(__x);
3297 const auto __yi = __to_intrin(__y);
3298 if constexpr (__is_avx512_abi<_Abi>())
3299 {
3300 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3301 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3302 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3304 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3306 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3308 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3309 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3310 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3311 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3312 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ);
3313 else
3314 __assert_unreachable<_Tp>();
3315 }
3316 else if constexpr (__have_avx)
3317 {
3318 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3319 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3320 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3321 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3322 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3323 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ));
3324 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3325 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ));
3326 else
3327 __assert_unreachable<_Tp>();
3328 }
3329 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3330 && sizeof(_Tp) == 4)
3331 {
3332 const auto __xn = __vector_bitcast<int>(__xi);
3333 const auto __yn = __vector_bitcast<int>(__yi);
3334 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3335 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3336 return __auto_bitcast(
3337 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp));
3338 }
3339 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3340 && sizeof(_Tp) == 8)
3341 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3342 -_mm_ucomigt_sd(__yi, __xi),
3343 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi),
3344 _mm_unpackhi_pd(__xi, __xi))};
3345 else
3346 return _Base::_S_isless(__x, __y);
3347 }
3348
3349 // }}}
3350 // _S_islessequal {{{
3351 template <typename _Tp, size_t _Np>
3352 static constexpr _MaskMember<_Tp>
3353 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3354 {
3355 const auto __xi = __to_intrin(__x);
3356 const auto __yi = __to_intrin(__y);
3357 if constexpr (__is_avx512_abi<_Abi>())
3358 {
3359 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3360 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3361 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3362 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3363 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3364 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3365 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3366 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3367 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3368 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3369 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3370 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3371 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ);
3372 else
3373 __assert_unreachable<_Tp>();
3374 }
3375 else if constexpr (__have_avx)
3376 {
3377 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3378 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3379 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3380 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3381 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3382 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ));
3383 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3384 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ));
3385 else
3386 __assert_unreachable<_Tp>();
3387 }
3388 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3389 && sizeof(_Tp) == 4)
3390 {
3391 const auto __xn = __vector_bitcast<int>(__xi);
3392 const auto __yn = __vector_bitcast<int>(__yi);
3393 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn;
3394 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn;
3395 return __auto_bitcast(
3396 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp));
3397 }
3398 else if constexpr (__have_sse2 && sizeof(__xi) == 16
3399 && sizeof(_Tp) == 8)
3400 return __vector_type_t<__int_with_sizeof_t<8>, 2>{
3401 -_mm_ucomige_sd(__yi, __xi),
3402 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi),
3403 _mm_unpackhi_pd(__xi, __xi))};
3404 else
3405 return _Base::_S_islessequal(__x, __y);
3406 }
3407
3408 // }}}
3409 // _S_islessgreater {{{
3410 template <typename _Tp, size_t _Np>
3411 static constexpr _MaskMember<_Tp>
3412 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
3413 {
3414 const auto __xi = __to_intrin(__x);
3415 const auto __yi = __to_intrin(__y);
3416 if constexpr (__is_avx512_abi<_Abi>())
3417 {
3418 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>();
3419 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4)
3420 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3421 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8)
3422 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3423 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3424 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3425 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3426 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3427 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3428 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3429 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3430 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ);
3431 else
3432 __assert_unreachable<_Tp>();
3433 }
3434 else if constexpr (__have_avx)
3435 {
3436 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4)
3437 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3438 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8)
3439 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3440 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3441 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ));
3442 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3443 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ));
3444 else
3445 __assert_unreachable<_Tp>();
3446 }
3447 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4)
3448 return __auto_bitcast(
3449 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi)));
3450 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8)
3451 return __to_masktype(
3452 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi)));
3453 else
3454 __assert_unreachable<_Tp>();
3455 }
3456
3457 //}}} }}}
3458 };
3459
3460// }}}
3461// _MaskImplX86Mixin {{{
3462struct _MaskImplX86Mixin
3463{
3464 template <typename _Tp>
3465 using _TypeTag = _Tp*;
3466
3467 using _Base = _MaskImplBuiltinMixin;
3468
3469 // _S_to_maskvector(bool) {{{
3470 template <typename _Up, size_t _ToN = 1, typename _Tp>
3471 _GLIBCXX_SIMD_INTRINSIC static constexpr enable_if_t<
3472 is_same_v<_Tp, bool>, _SimdWrapper<_Up, _ToN>>
3473 _S_to_maskvector(_Tp __x)
3474 {
3475 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3476 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
3477 : __vector_type_t<_Up, _ToN>();
3478 }
3479
3480 // }}}
3481 // _S_to_maskvector(_SanitizedBitMask) {{{
3482 template <typename _Up, size_t _UpN = 0, size_t _Np,
3483 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3484 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3485 _S_to_maskvector(_SanitizedBitMask<_Np> __x)
3486 {
3487 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3488 using _UV = __vector_type_t<_Up, _ToN>;
3489 using _UI = __intrinsic_type_t<_Up, _ToN>;
3490 [[maybe_unused]] const auto __k = __x._M_to_bits();
3491 if constexpr (_Np == 1)
3492 return _S_to_maskvector<_Up, _ToN>(__k);
3493 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3494 return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
3495 [&](auto __i) -> _Up { return -__x[__i.value]; });
3496 else if constexpr (sizeof(_Up) == 1)
3497 {
3498 if constexpr (sizeof(_UI) == 16)
3499 {
3500 if constexpr (__have_avx512bw_vl)
3501 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k));
3502 else if constexpr (__have_avx512bw)
3503 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k)));
3504 else if constexpr (__have_avx512f)
3505 {
3506 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3507 auto __as16bits
3508 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3509 __hi256(__as32bits)));
3510 return __intrin_bitcast<_UV>(
3511 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits)));
3512 }
3513 else if constexpr (__have_ssse3)
3514 {
3515 const auto __bitmask = __to_intrin(
3516 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4,
3517 8, 16, 32, 64, 128));
3518 return __intrin_bitcast<_UV>(
3519 __vector_bitcast<_Up>(
3520 _mm_shuffle_epi8(__to_intrin(
3521 __vector_type_t<_ULLong, 2>{__k}),
3522 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1,
3523 1, 1, 1, 1, 1, 1, 1))
3524 & __bitmask)
3525 != 0);
3526 }
3527 // else fall through
3528 }
3529 else if constexpr (sizeof(_UI) == 32)
3530 {
3531 if constexpr (__have_avx512bw_vl)
3532 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k));
3533 else if constexpr (__have_avx512bw)
3534 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k)));
3535 else if constexpr (__have_avx512f)
3536 {
3537 auto __as16bits = // 0 16 1 17 ... 15 31
3538 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()),
3539 16)
3540 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16,
3541 ~__m512i()),
3542 16);
3543 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16(
3544 __lo256(__as16bits),
3545 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ...
3546 );
3547 // deinterleave:
3548 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8(
3549 __0_16_1_17, // 0 16 1 17 2 ...
3550 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9,
3551 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1,
3552 3, 5, 7, 9, 11, 13,
3553 15)))); // 0-7 16-23 8-15 24-31 -> xzyw
3554 // 0-3 8-11 16-19 24-27
3555 // 4-7 12-15 20-23 28-31
3556 }
3557 else if constexpr (__have_avx2)
3558 {
3559 const auto __bitmask
3560 = _mm256_broadcastsi128_si256(__to_intrin(
3561 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
3562 4, 8, 16, 32, 64, 128)));
3563 return __vector_bitcast<_Up>(
3564 __vector_bitcast<_Up>(
3565 _mm256_shuffle_epi8(
3566 _mm256_broadcastsi128_si256(
3567 __to_intrin(__vector_type_t<_ULLong, 2>{__k})),
3568 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3569 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3570 3, 3, 3, 3, 3, 3))
3571 & __bitmask)
3572 != 0);
3573 }
3574 // else fall through
3575 }
3576 else if constexpr (sizeof(_UI) == 64)
3577 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k));
3578 if constexpr (std::min(_ToN, _Np) <= 4)
3579 {
3580 if constexpr (_Np > 7) // avoid overflow
3581 __x &= _SanitizedBitMask<_Np>(0x0f);
3582 const _UInt __char_mask
3583 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL)
3584 * 0xff;
3585 _UV __r = {};
3586 __builtin_memcpy(&__r, &__char_mask,
3587 std::min(sizeof(__r), sizeof(__char_mask)));
3588 return __r;
3589 }
3590 else if constexpr (std::min(_ToN, _Np) <= 7)
3591 {
3592 if constexpr (_Np > 7) // avoid overflow
3593 __x &= _SanitizedBitMask<_Np>(0x7f);
3594 const _ULLong __char_mask
3595 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL)
3596 * 0xff;
3597 _UV __r = {};
3598 __builtin_memcpy(&__r, &__char_mask,
3599 std::min(sizeof(__r), sizeof(__char_mask)));
3600 return __r;
3601 }
3602 }
3603 else if constexpr (sizeof(_Up) == 2)
3604 {
3605 if constexpr (sizeof(_UI) == 16)
3606 {
3607 if constexpr (__have_avx512bw_vl)
3608 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k));
3609 else if constexpr (__have_avx512bw)
3610 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k)));
3611 else if constexpr (__have_avx512f)
3612 {
3613 __m256i __as32bits = {};
3614 if constexpr (__have_avx512vl)
3615 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i());
3616 else
3617 __as32bits
3618 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()));
3619 return __intrin_bitcast<_UV>(
3620 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits)));
3621 }
3622 // else fall through
3623 }
3624 else if constexpr (sizeof(_UI) == 32)
3625 {
3626 if constexpr (__have_avx512bw_vl)
3627 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k));
3628 else if constexpr (__have_avx512bw)
3629 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k)));
3630 else if constexpr (__have_avx512f)
3631 {
3632 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i());
3633 return __vector_bitcast<_Up>(
3634 __xzyw(_mm256_packs_epi32(__lo256(__as32bits),
3635 __hi256(__as32bits))));
3636 }
3637 // else fall through
3638 }
3639 else if constexpr (sizeof(_UI) == 64)
3640 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k));
3641 }
3642 else if constexpr (sizeof(_Up) == 4)
3643 {
3644 if constexpr (sizeof(_UI) == 16)
3645 {
3646 if constexpr (__have_avx512dq_vl)
3647 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k));
3648 else if constexpr (__have_avx512dq)
3649 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k)));
3650 else if constexpr (__have_avx512vl)
3651 return __intrin_bitcast<_UV>(
3652 _mm_maskz_mov_epi32(__k, ~__m128i()));
3653 else if constexpr (__have_avx512f)
3654 return __intrin_bitcast<_UV>(
3655 __lo128(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3656 // else fall through
3657 }
3658 else if constexpr (sizeof(_UI) == 32)
3659 {
3660 if constexpr (__have_avx512dq_vl)
3661 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k));
3662 else if constexpr (__have_avx512dq)
3663 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k)));
3664 else if constexpr (__have_avx512vl)
3665 return __vector_bitcast<_Up>(
3666 _mm256_maskz_mov_epi32(__k, ~__m256i()));
3667 else if constexpr (__have_avx512f)
3668 return __vector_bitcast<_Up>(
3669 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())));
3670 // else fall through
3671 }
3672 else if constexpr (sizeof(_UI) == 64)
3673 return __vector_bitcast<_Up>(
3674 __have_avx512dq ? _mm512_movm_epi32(__k)
3675 : _mm512_maskz_mov_epi32(__k, ~__m512i()));
3676 }
3677 else if constexpr (sizeof(_Up) == 8)
3678 {
3679 if constexpr (sizeof(_UI) == 16)
3680 {
3681 if constexpr (__have_avx512dq_vl)
3682 return __vector_bitcast<_Up>(_mm_movm_epi64(__k));
3683 else if constexpr (__have_avx512dq)
3684 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k)));
3685 else if constexpr (__have_avx512vl)
3686 return __vector_bitcast<_Up>(
3687 _mm_maskz_mov_epi64(__k, ~__m128i()));
3688 else if constexpr (__have_avx512f)
3689 return __vector_bitcast<_Up>(
3690 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3691 // else fall through
3692 }
3693 else if constexpr (sizeof(_UI) == 32)
3694 {
3695 if constexpr (__have_avx512dq_vl)
3696 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k));
3697 else if constexpr (__have_avx512dq)
3698 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k)));
3699 else if constexpr (__have_avx512vl)
3700 return __vector_bitcast<_Up>(
3701 _mm256_maskz_mov_epi64(__k, ~__m256i()));
3702 else if constexpr (__have_avx512f)
3703 return __vector_bitcast<_Up>(
3704 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i())));
3705 // else fall through
3706 }
3707 else if constexpr (sizeof(_UI) == 64)
3708 return __vector_bitcast<_Up>(
3709 __have_avx512dq ? _mm512_movm_epi64(__k)
3710 : _mm512_maskz_mov_epi64(__k, ~__m512i()));
3711 }
3712
3713 using _UpUInt = make_unsigned_t<_Up>;
3714 using _V = __vector_type_t<_UpUInt, _ToN>;
3715 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__;
3716 if constexpr (_ToN == 2)
3717 {
3718 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])});
3719 }
3720 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32)
3721 {
3722 if constexpr (sizeof(_Up) == 4)
3723 return __vector_bitcast<_Up>(_mm256_cmp_ps(
3724 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)),
3725 _mm256_castsi256_ps(_mm256_setr_epi32(
3726 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))),
3727 _mm256_setzero_ps(), _CMP_NEQ_UQ));
3728 else if constexpr (sizeof(_Up) == 8)
3729 return __vector_bitcast<_Up>(_mm256_cmp_pd(
3730 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)),
3731 _mm256_castsi256_pd(
3732 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))),
3733 _mm256_setzero_pd(), _CMP_NEQ_UQ));
3734 else
3735 __assert_unreachable<_Up>();
3736 }
3737 else if constexpr (__bits_per_element >= _ToN)
3738 {
3739 constexpr auto __bitmask
3740 = __generate_vector<_V>([](auto __i) constexpr->_UpUInt {
3741 return __i < _ToN ? 1ull << __i : 0;
3742 });
3743 const auto __bits
3744 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
3745 if constexpr (__bits_per_element > _ToN)
3746 return __vector_bitcast<_Up>(__bits) > 0;
3747 else
3748 return __vector_bitcast<_Up>(__bits != 0);
3749 }
3750 else
3751 {
3752 const _V __tmp
3753 = __generate_vector<_V>([&](auto __i) constexpr {
3754 return static_cast<_UpUInt>(
3755 __k >> (__bits_per_element * (__i / __bits_per_element)));
3756 })
3757 & __generate_vector<_V>([](auto __i) constexpr {
3758 return static_cast<_UpUInt>(1ull
3759 << (__i % __bits_per_element));
3760 }); // mask bit index
3761 return __intrin_bitcast<_UV>(__tmp != _V());
3762 }
3763 }
3764
3765 // }}}
3766 // _S_to_maskvector(_SimdWrapper) {{{
3767 template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
3768 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3769 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
3770 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
3771 {
3772 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
3773 using _TW = _SimdWrapper<_Tp, _Np>;
3774 using _UW = _SimdWrapper<_Up, _ToN>;
3775 using _UI = __intrinsic_type_t<_Up, _ToN>;
3776 if constexpr (is_same_v<_Tp, bool>) // bits -> vector
3777 return _S_to_maskvector<_Up, _ToN>(
3778 _BitMask<_Np>(__x._M_data)._M_sanitized());
3779 // vector -> vector bitcast
3780 else if constexpr (sizeof(_Up) == sizeof(_Tp)
3781 && sizeof(_TW) == sizeof(_UW))
3782 return __wrapper_bitcast<_Up, _ToN>(
3783 _ToN <= _Np
3784 ? __x
3785 : simd_abi::_VecBuiltin<sizeof(_Tp) * _Np>::_S_masked(__x));
3786 else // vector -> vector {{{
3787 {
3788 if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
3789 {
3790 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
3791 return __generate_from_n_evaluations<std::min(_ToN, _Np),
3792 __vector_type_t<_Up, _ToN>>(
3793 [&](auto __i) -> _Up { return __y[__i.value]; });
3794 }
3795 using _To = __vector_type_t<_Up, _ToN>;
3796 [[maybe_unused]] constexpr size_t _FromN = _Np;
3797 constexpr int _FromBytes = sizeof(_Tp);
3798 constexpr int _ToBytes = sizeof(_Up);
3799 const auto __k = __x._M_data;
3800
3801 if constexpr (_FromBytes == _ToBytes)
3802 return __intrin_bitcast<_To>(__k);
3803 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16)
3804 { // SSE -> SSE {{{
3805 if constexpr (_FromBytes == 4 && _ToBytes == 8)
3806 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3807 else if constexpr (_FromBytes == 2 && _ToBytes == 8)
3808 {
3809 const auto __y
3810 = __vector_bitcast<int>(__interleave128_lo(__k, __k));
3811 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3812 }
3813 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3814 {
3815 auto __y
3816 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3817 auto __z
3818 = __vector_bitcast<int>(__interleave128_lo(__y, __y));
3819 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z));
3820 }
3821 else if constexpr (_FromBytes == 8 && _ToBytes == 4
3822 && __have_sse2)
3823 return __intrin_bitcast<_To>(
3824 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3825 else if constexpr (_FromBytes == 8 && _ToBytes == 4)
3826 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k),
3827 _UI());
3828 else if constexpr (_FromBytes == 2 && _ToBytes == 4)
3829 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3830 else if constexpr (_FromBytes == 1 && _ToBytes == 4)
3831 {
3832 const auto __y
3833 = __vector_bitcast<short>(__interleave128_lo(__k, __k));
3834 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y));
3835 }
3836 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
3837 {
3838 if constexpr (__have_sse2 && !__have_ssse3)
3839 return __intrin_bitcast<_To>(_mm_packs_epi32(
3840 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()),
3841 __m128i()));
3842 else
3843 return __intrin_bitcast<_To>(
3844 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(
3845 __vector_bitcast<_Up>(__k)));
3846 }
3847 else if constexpr (_FromBytes == 4 && _ToBytes == 2)
3848 return __intrin_bitcast<_To>(
3849 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()));
3850 else if constexpr (_FromBytes == 1 && _ToBytes == 2)
3851 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k));
3852 else if constexpr (_FromBytes == 8 && _ToBytes == 1
3853 && __have_ssse3)
3854 return __intrin_bitcast<_To>(
3855 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3856 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1,
3857 -1, -1, -1, -1, -1, -1, -1,
3858 -1)));
3859 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
3860 {
3861 auto __y
3862 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3863 __y = _mm_packs_epi32(__y, __m128i());
3864 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3865 }
3866 else if constexpr (_FromBytes == 4 && _ToBytes == 1
3867 && __have_ssse3)
3868 return __intrin_bitcast<_To>(
3869 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3870 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
3871 -1, -1, -1, -1, -1, -1, -1,
3872 -1)));
3873 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
3874 {
3875 const auto __y
3876 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i());
3877 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i()));
3878 }
3879 else if constexpr (_FromBytes == 2 && _ToBytes == 1)
3880 return __intrin_bitcast<_To>(
3881 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()));
3882 else
3883 __assert_unreachable<_Tp>();
3884 } // }}}
3885 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32)
3886 { // AVX -> AVX {{{
3887 if constexpr (_FromBytes == _ToBytes)
3888 __assert_unreachable<_Tp>();
3889 else if constexpr (_FromBytes == _ToBytes * 2)
3890 {
3891 const auto __y = __vector_bitcast<_LLong>(__k);
3892 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3893 _mm_packs_epi16(__lo128(__y), __hi128(__y))));
3894 }
3895 else if constexpr (_FromBytes == _ToBytes * 4)
3896 {
3897 const auto __y = __vector_bitcast<_LLong>(__k);
3898 return __intrin_bitcast<_To>(_mm256_castsi128_si256(
3899 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
3900 __m128i())));
3901 }
3902 else if constexpr (_FromBytes == _ToBytes * 8)
3903 {
3904 const auto __y = __vector_bitcast<_LLong>(__k);
3905 return __intrin_bitcast<_To>(
3906 _mm256_castsi128_si256(_mm_shuffle_epi8(
3907 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
3908 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
3909 -1, -1, -1, -1, -1))));
3910 }
3911 else if constexpr (_FromBytes * 2 == _ToBytes)
3912 {
3913 auto __y = __xzyw(__to_intrin(__k));
3914 if constexpr (is_floating_point_v<
3915 _Tp> || (!__have_avx2 && _FromBytes == 4))
3916 {
3917 const auto __yy = __vector_bitcast<float>(__y);
3918 return __intrin_bitcast<_To>(
3919 _mm256_unpacklo_ps(__yy, __yy));
3920 }
3921 else
3922 return __intrin_bitcast<_To>(
3923 _mm256_unpacklo_epi8(__y, __y));
3924 }
3925 else if constexpr (_FromBytes * 4 == _ToBytes)
3926 {
3927 auto __y
3928 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
3929 __lo128(__vector_bitcast<_LLong>(
3930 __k))); // drops 3/4 of input
3931 return __intrin_bitcast<_To>(
3932 __concat(_mm_unpacklo_epi16(__y, __y),
3933 _mm_unpackhi_epi16(__y, __y)));
3934 }
3935 else if constexpr (_FromBytes == 1 && _ToBytes == 8)
3936 {
3937 auto __y
3938 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)),
3939 __lo128(__vector_bitcast<_LLong>(
3940 __k))); // drops 3/4 of input
3941 __y
3942 = _mm_unpacklo_epi16(__y,
3943 __y); // drops another 1/2 => 7/8 total
3944 return __intrin_bitcast<_To>(
3945 __concat(_mm_unpacklo_epi32(__y, __y),
3946 _mm_unpackhi_epi32(__y, __y)));
3947 }
3948 else
3949 __assert_unreachable<_Tp>();
3950 } // }}}
3951 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16)
3952 { // SSE -> AVX {{{
3953 if constexpr (_FromBytes == _ToBytes)
3954 return __intrin_bitcast<_To>(
3955 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>(
3956 __zero_extend(__to_intrin(__k))));
3957 else if constexpr (_FromBytes * 2 == _ToBytes)
3958 { // keep all
3959 return __intrin_bitcast<_To>(
3960 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k),
3961 __vector_bitcast<_LLong>(__k)),
3962 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k),
3963 __vector_bitcast<_LLong>(__k))));
3964 }
3965 else if constexpr (_FromBytes * 4 == _ToBytes)
3966 {
3967 if constexpr (__have_avx2)
3968 {
3969 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
3970 __concat(__vector_bitcast<_LLong>(__k),
3971 __vector_bitcast<_LLong>(__k)),
3972 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3,
3973 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6,
3974 6, 6, 7, 7, 7, 7)));
3975 }
3976 else
3977 {
3978 return __intrin_bitcast<_To>(__concat(
3979 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3980 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1,
3981 2, 2, 2, 2, 3, 3, 3, 3)),
3982 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
3983 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5,
3984 6, 6, 6, 6, 7, 7, 7,
3985 7))));
3986 }
3987 }
3988 else if constexpr (_FromBytes * 8 == _ToBytes)
3989 {
3990 if constexpr (__have_avx2)
3991 {
3992 return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
3993 __concat(__vector_bitcast<_LLong>(__k),
3994 __vector_bitcast<_LLong>(__k)),
3995 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
3996 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
3997 3, 3, 3, 3, 3, 3)));
3998 }
3999 else
4000 {
4001 return __intrin_bitcast<_To>(__concat(
4002 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4003 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
4004 1, 1, 1, 1, 1, 1, 1, 1)),
4005 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4006 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2,
4007 3, 3, 3, 3, 3, 3, 3,
4008 3))));
4009 }
4010 }
4011 else if constexpr (_FromBytes == _ToBytes * 2)
4012 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4013 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i()))));
4014 else if constexpr (_FromBytes == 8 && _ToBytes == 2)
4015 {
4016 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4017 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4018 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1,
4019 -1, -1, -1, -1, -1, -1, -1,
4020 -1)))));
4021 }
4022 else if constexpr (_FromBytes == 4 && _ToBytes == 1)
4023 {
4024 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4025 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4026 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1,
4027 -1, -1, -1, -1, -1, -1, -1,
4028 -1)))));
4029 }
4030 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4031 {
4032 return __intrin_bitcast<_To>(__m256i(__zero_extend(
4033 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k),
4034 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1,
4035 -1, -1, -1, -1, -1, -1, -1,
4036 -1, -1)))));
4037 }
4038 else
4039 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4040 } // }}}
4041 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32)
4042 { // AVX -> SSE {{{
4043 if constexpr (_FromBytes == _ToBytes)
4044 { // keep low 1/2
4045 return __intrin_bitcast<_To>(__lo128(__k));
4046 }
4047 else if constexpr (_FromBytes == _ToBytes * 2)
4048 { // keep all
4049 auto __y = __vector_bitcast<_LLong>(__k);
4050 return __intrin_bitcast<_To>(
4051 _mm_packs_epi16(__lo128(__y), __hi128(__y)));
4052 }
4053 else if constexpr (_FromBytes == _ToBytes * 4)
4054 { // add 1/2 undef
4055 auto __y = __vector_bitcast<_LLong>(__k);
4056 return __intrin_bitcast<_To>(
4057 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)),
4058 __m128i()));
4059 }
4060 else if constexpr (_FromBytes == 8 && _ToBytes == 1)
4061 { // add 3/4 undef
4062 auto __y = __vector_bitcast<_LLong>(__k);
4063 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
4064 _mm_packs_epi16(__lo128(__y), __hi128(__y)),
4065 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1,
4066 -1, -1, -1, -1)));
4067 }
4068 else if constexpr (_FromBytes * 2 == _ToBytes)
4069 { // keep low 1/4
4070 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4071 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4072 }
4073 else if constexpr (_FromBytes * 4 == _ToBytes)
4074 { // keep low 1/8
4075 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4076 __y = _mm_unpacklo_epi8(__y, __y);
4077 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4078 }
4079 else if constexpr (_FromBytes * 8 == _ToBytes)
4080 { // keep low 1/16
4081 auto __y = __lo128(__vector_bitcast<_LLong>(__k));
4082 __y = _mm_unpacklo_epi8(__y, __y);
4083 __y = _mm_unpacklo_epi8(__y, __y);
4084 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y));
4085 }
4086 else
4087 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable");
4088 } // }}}
4089 else
4090 return _Base::template _S_to_maskvector<_Up, _ToN>(__x);
4091 /*
4092 if constexpr (_FromBytes > _ToBytes) {
4093 const _To __y = __vector_bitcast<_Up>(__k);
4094 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4095 constexpr int _Stride = _FromBytes / _ToBytes;
4096 return _To{__y[(_Is + 1) * _Stride - 1]...};
4097 }(make_index_sequence<std::min(_ToN, _FromN)>());
4098 } else {
4099 // {0, 0, 1, 1} (_Dups = 2, _Is<4>)
4100 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>)
4101 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>)
4102 // ...
4103 return [&] <size_t... _Is> (index_sequence<_Is...>) {
4104 constexpr int __dup = _ToBytes / _FromBytes;
4105 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...});
4106 }(make_index_sequence<_FromN>());
4107 }
4108 */
4109 } // }}}
4110 }
4111
4112 // }}}
4113 // _S_to_bits {{{
  // Convert a mask in vector representation (each true element has all bits
  // set) — or a plain bool bitmask — into a sanitized _Np-bit bitmask.
  // Dispatches on element size and available ISA extensions.
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if constexpr (is_same_v<_Tp, bool>)
        // Already a bitmask; only drop bits >= _Np.
        return _BitMask<_Np>(__x._M_data)._M_sanitized();
      else
        {
          static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
          // Constant-evaluation (and known-constant) path: build the bitmask
          // element by element without intrinsics.
          if (__builtin_is_constant_evaluated()
              || __builtin_constant_p(__x._M_data))
            {
              // Each element is 0 or -1; negating yields 0 or 1.
              const auto __bools = -__x._M_data;
              const _ULLong __k = __call_with_n_evaluations<_Np>(
                [](auto... __bits) { return (__bits | ...); },
                [&](auto __i) { return _ULLong(__bools[+__i]) << __i; });
              // Only use this result if it folded to a constant; otherwise
              // fall through to the intrinsic code paths below.
              if (__builtin_is_constant_evaluated()
                  || __builtin_constant_p(__k))
                return __k;
            }
          const auto __xi = __to_intrin(__x);
          if constexpr (sizeof(_Tp) == 1)
            if constexpr (sizeof(__xi) == 16)
              if constexpr (__have_avx512bw_vl)
                return _BitMask<_Np>(_mm_movepi8_mask(__xi));
              else // implies SSE2
                return _BitMask<_Np>(_mm_movemask_epi8(__xi));
            else if constexpr (sizeof(__xi) == 32)
              if constexpr (__have_avx512bw_vl)
                return _BitMask<_Np>(_mm256_movepi8_mask(__xi));
              else // implies AVX2
                return _BitMask<_Np>(_mm256_movemask_epi8(__xi));
            else // implies AVX512BW
              return _BitMask<_Np>(_mm512_movepi8_mask(__xi));

          else if constexpr (sizeof(_Tp) == 2)
            if constexpr (sizeof(__xi) == 16)
              if constexpr (__have_avx512bw_vl)
                return _BitMask<_Np>(_mm_movepi16_mask(__xi));
              else if constexpr (__have_avx512bw)
                return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
              else // implies SSE2
                // Saturating pack narrows 16-bit mask elements to 8 bits,
                // then movemask extracts the sign bits.
                return _BitMask<_Np>(
                  _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i())));
            else if constexpr (sizeof(__xi) == 32)
              if constexpr (__have_avx512bw_vl)
                return _BitMask<_Np>(_mm256_movepi16_mask(__xi));
              else if constexpr (__have_avx512bw)
                return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi)));
              else // implies SSE2
                return _BitMask<_Np>(_mm_movemask_epi8(
                  _mm_packs_epi16(__lo128(__xi), __hi128(__xi))));
            else // implies AVX512BW
              return _BitMask<_Np>(_mm512_movepi16_mask(__xi));

          else if constexpr (sizeof(_Tp) == 4)
            if constexpr (sizeof(__xi) == 16)
              if constexpr (__have_avx512dq_vl)
                return _BitMask<_Np>(_mm_movepi32_mask(__xi));
              else if constexpr (__have_avx512vl)
                // cmplt against zero: mask elements are -1 (negative) or 0.
                return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i()));
              else if constexpr (__have_avx512dq)
                return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
              else if constexpr (__have_avx512f)
                return _BitMask<_Np>(
                  _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
              else // implies SSE
                return _BitMask<_Np>(
                  _mm_movemask_ps(reinterpret_cast<__m128>(__xi)));
            else if constexpr (sizeof(__xi) == 32)
              if constexpr (__have_avx512dq_vl)
                return _BitMask<_Np>(_mm256_movepi32_mask(__xi));
              else if constexpr (__have_avx512dq)
                return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi)));
              else if constexpr (__have_avx512vl)
                return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i()));
              else if constexpr (__have_avx512f)
                return _BitMask<_Np>(
                  _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i()));
              else // implies AVX
                return _BitMask<_Np>(
                  _mm256_movemask_ps(reinterpret_cast<__m256>(__xi)));
            else // implies AVX512??
              if constexpr (__have_avx512dq)
                return _BitMask<_Np>(_mm512_movepi32_mask(__xi));
              else // implies AVX512F
                return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i()));

          else if constexpr (sizeof(_Tp) == 8)
            if constexpr (sizeof(__xi) == 16)
              if constexpr (__have_avx512dq_vl)
                return _BitMask<_Np>(_mm_movepi64_mask(__xi));
              else if constexpr (__have_avx512dq)
                return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
              else if constexpr (__have_avx512vl)
                return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i()));
              else if constexpr (__have_avx512f)
                return _BitMask<_Np>(
                  _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
              else // implies SSE2
                return _BitMask<_Np>(
                  _mm_movemask_pd(reinterpret_cast<__m128d>(__xi)));
            else if constexpr (sizeof(__xi) == 32)
              if constexpr (__have_avx512dq_vl)
                return _BitMask<_Np>(_mm256_movepi64_mask(__xi));
              else if constexpr (__have_avx512dq)
                return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi)));
              else if constexpr (__have_avx512vl)
                return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i()));
              else if constexpr (__have_avx512f)
                return _BitMask<_Np>(
                  _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i()));
              else // implies AVX
                return _BitMask<_Np>(
                  _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi)));
            else // implies AVX512??
              if constexpr (__have_avx512dq)
                return _BitMask<_Np>(_mm512_movepi64_mask(__xi));
              else // implies AVX512F
                return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i()));

          else
            __assert_unreachable<_Tp>();
        }
    }
4239 // }}}
4240};
4241
4242// }}}
4243// _MaskImplX86 {{{
// x86 simd_mask implementation. Combines the representation-conversion
// helpers of _MaskImplX86Mixin with the generic fallbacks of
// _MaskImplBuiltin, overriding operations where x86 intrinsics do better.
template <typename _Abi, typename>
  struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplX86Mixin::_S_to_bits;
    using _MaskImplX86Mixin::_S_to_maskvector;
    using _MaskImplBuiltin<_Abi>::_S_convert;

    // member types {{{
    // Storage type of simd<_Tp, _Abi>.
    template <typename _Tp>
      using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;

    // Storage type of simd_mask<_Tp, _Abi>: a vector mask for SSE/AVX ABIs,
    // a bitmask (__mmask*) for AVX512 ABIs.
    template <typename _Tp>
      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;

    // Number of elements in simd<_Tp, _Abi>.
    template <typename _Tp>
      static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;

    // Generic fallback implementation.
    using _Base = _MaskImplBuiltin<_Abi>;
4262
4263 // }}}
4264 // _S_broadcast {{{
4265 template <typename _Tp>
4266 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4267 _S_broadcast(bool __x)
4268 {
4269 if constexpr (__is_avx512_abi<_Abi>())
4270 return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1))
4271 : _MaskMember<_Tp>();
4272 else
4273 return _Base::template _S_broadcast<_Tp>(__x);
4274 }
4275
4276 // }}}
4277 // _S_load {{{
4278 template <typename _Tp>
4279 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
4280 _S_load(const bool* __mem)
4281 {
4282 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
4283 if constexpr (__have_avx512bw)
4284 {
4285 const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) {
4286 if constexpr (__is_avx512_abi<_Abi>())
4287 return __bits;
4288 else
4289 return _S_to_maskvector<_Tp>(
4290 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
4291 };
4292
4293 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
4294 {
4295 __m128i __a = {};
4296 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4297 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a));
4298 }
4299 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl)
4300 {
4301 __m256i __a = {};
4302 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4303 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a));
4304 }
4305 else if constexpr (_S_size<_Tp> <= 64)
4306 {
4307 __m512i __a = {};
4308 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4309 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a));
4310 }
4311 }
4312 else if constexpr (__is_avx512_abi<_Abi>())
4313 {
4314 if constexpr (_S_size<_Tp> <= 8)
4315 {
4316 __m128i __a = {};
4317 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4318 const auto __b = _mm512_cvtepi8_epi64(__a);
4319 return _mm512_test_epi64_mask(__b, __b);
4320 }
4321 else if constexpr (_S_size<_Tp> <= 16)
4322 {
4323 __m128i __a = {};
4324 __builtin_memcpy(&__a, __mem, _S_size<_Tp>);
4325 const auto __b = _mm512_cvtepi8_epi32(__a);
4326 return _mm512_test_epi32_mask(__b, __b);
4327 }
4328 else if constexpr (_S_size<_Tp> <= 32)
4329 {
4330 __m128i __a = {};
4331 __builtin_memcpy(&__a, __mem, 16);
4332 const auto __b = _mm512_cvtepi8_epi32(__a);
4333 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16);
4334 const auto __c = _mm512_cvtepi8_epi32(__a);
4335 return _mm512_test_epi32_mask(__b, __b)
4336 | (_mm512_test_epi32_mask(__c, __c) << 16);
4337 }
4338 else if constexpr (_S_size<_Tp> <= 64)
4339 {
4340 __m128i __a = {};
4341 __builtin_memcpy(&__a, __mem, 16);
4342 const auto __b = _mm512_cvtepi8_epi32(__a);
4343 __builtin_memcpy(&__a, __mem + 16, 16);
4344 const auto __c = _mm512_cvtepi8_epi32(__a);
4345 if constexpr (_S_size<_Tp> <= 48)
4346 {
4347 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32);
4348 const auto __d = _mm512_cvtepi8_epi32(__a);
4349 return _mm512_test_epi32_mask(__b, __b)
4350 | (_mm512_test_epi32_mask(__c, __c) << 16)
4351 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
4352 }
4353 else
4354 {
4355 __builtin_memcpy(&__a, __mem + 16, 16);
4356 const auto __d = _mm512_cvtepi8_epi32(__a);
4357 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48);
4358 const auto __e = _mm512_cvtepi8_epi32(__a);
4359 return _mm512_test_epi32_mask(__b, __b)
4360 | (_mm512_test_epi32_mask(__c, __c) << 16)
4361 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32)
4362 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48);
4363 }
4364 }
4365 else
4366 __assert_unreachable<_Tp>();
4367 }
4368 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2)
4369 return __vector_bitcast<_Tp>(
4370 __vector_type16_t<int>{-int(__mem[0]), -int(__mem[0]),
4371 -int(__mem[1]), -int(__mem[1])});
4372 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx)
4373 {
4374 int __bool4 = 0;
4375 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>);
4376 const auto __k = __to_intrin(
4377 (__vector_broadcast<4>(__bool4)
4378 & __make_vector<int>(0x1, 0x100, 0x10000,
4379 _S_size<_Tp> == 4 ? 0x1000000 : 0))
4380 != 0);
4381 return __vector_bitcast<_Tp>(
4382 __concat(_mm_unpacklo_epi32(__k, __k),
4383 _mm_unpackhi_epi32(__k, __k)));
4384 }
4385 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 4)
4386 {
4387 int __bools = 0;
4388 __builtin_memcpy(&__bools, __mem, _S_size<_Tp>);
4389 if constexpr (__have_sse2)
4390 {
4391 __m128i __k = _mm_cvtsi32_si128(__bools);
4392 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4393 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4394 _mm_unpacklo_epi16(__k, __k));
4395 }
4396 else
4397 {
4398 __m128 __k = _mm_cvtpi8_ps(_mm_cvtsi32_si64(__bools));
4399 _mm_empty();
4400 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4401 _mm_cmpgt_ps(__k, __m128()));
4402 }
4403 }
4404 else if constexpr (sizeof(_Tp) == 4 && _S_size<_Tp> <= 8)
4405 {
4406 __m128i __k = {};
4407 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4408 __k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(__k, __k), __m128i());
4409 return __vector_bitcast<_Tp>(
4410 __concat(_mm_unpacklo_epi16(__k, __k),
4411 _mm_unpackhi_epi16(__k, __k)));
4412 }
4413 else if constexpr (sizeof(_Tp) == 2 && _S_size<_Tp> <= 16)
4414 {
4415 __m128i __k = {};
4416 __builtin_memcpy(&__k, __mem, _S_size<_Tp>);
4417 __k = _mm_cmpgt_epi8(__k, __m128i());
4418 if constexpr (_S_size<_Tp> <= 8)
4419 return __vector_bitcast<_Tp, _S_size<_Tp>>(
4420 _mm_unpacklo_epi8(__k, __k));
4421 else
4422 return __concat(_mm_unpacklo_epi8(__k, __k),
4423 _mm_unpackhi_epi8(__k, __k));
4424 }
4425 else
4426 return _Base::template _S_load<_Tp>(__mem);
4427 }
4428
4429 // }}}
4430 // _S_from_bitmask{{{
    // Convert a sanitized bitmask into this ABI's mask representation.
    template <size_t _Np, typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
      _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
      {
        static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
        if constexpr (__is_avx512_abi<_Abi>())
          // AVX512 masks are stored as bitmasks already.
          return __bits._M_to_bits();
        else
          return _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits);
      }
4441
4442 // }}}
4443 // _S_masked_load {{{2
    // Load bools from __mem into __merge, but only at positions where __mask
    // is true; other positions keep their value from __merge.
    template <typename _Tp, size_t _Np>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
                     _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (__have_avx512bw_vl)
              {
                // Masked byte load (masked-off lanes read as 0), then test
                // for non-zero to rebuild a bitmask; blend with __merge.
                if constexpr (_Np <= 16)
                  {
                    const auto __a
                      = _mm_mask_loadu_epi8(__m128i(), __mask, __mem);
                    return (__merge & ~__mask) | _mm_test_epi8_mask(__a, __a);
                  }
                else if constexpr (_Np <= 32)
                  {
                    const auto __a
                      = _mm256_mask_loadu_epi8(__m256i(), __mask, __mem);
                    return (__merge & ~__mask)
                           | _mm256_test_epi8_mask(__a, __a);
                  }
                else if constexpr (_Np <= 64)
                  {
                    const auto __a
                      = _mm512_mask_loadu_epi8(__m512i(), __mask, __mem);
                    return (__merge & ~__mask)
                           | _mm512_test_epi8_mask(__a, __a);
                  }
                else
                  __assert_unreachable<_Tp>();
              }
            else
              {
                // No BW/VL: update the selected bits one at a time.
                _BitOps::_S_bit_iteration(__mask, [&](auto __i) {
                  __merge._M_set(__i, __mem[__i]);
                });
                return __merge;
              }
          }
        // Vector-mask ABIs with AVX512BW+VL available: convert the mask to
        // bits, do a masked byte load, and use mask_sub (0 - 1 == ~0) to turn
        // the loaded 0/1 bytes into all-zeros/all-ones elements in __merge.
        else if constexpr (__have_avx512bw_vl && _Np == 32 && sizeof(_Tp) == 1)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm256_mask_sub_epi8(__to_intrin(__merge), __k, __m256i(),
                                           _mm256_mask_loadu_epi8(__m256i(),
                                                                  __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 1)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge
              = _mm_mask_sub_epi8(__vector_bitcast<_LLong>(__merge), __k,
                                  __m128i(),
                                  _mm_mask_loadu_epi8(__m128i(), __k, __mem));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 16 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm256_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 2)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = _mm_mask_sub_epi16(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi16(_mm_mask_loadu_epi8(__m128i(), __k, __mem)));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 8 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi32(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 4)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi32(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi32(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 4 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm256_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m256i(),
              _mm256_cvtepi8_epi64(
                _mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else if constexpr (__have_avx512bw_vl && _Np == 2 && sizeof(_Tp) == 8)
          {
            const auto __k = _S_to_bits(__mask)._M_to_bits();
            __merge = __vector_bitcast<_Tp>(_mm_mask_sub_epi64(
              __vector_bitcast<_LLong>(__merge), __k, __m128i(),
              _mm_cvtepi8_epi64(_mm_mask_loadu_epi8(__m128i(), __k, __mem))));
          }
        else
          return _Base::_S_masked_load(__merge, __mask, __mem);
        return __merge;
      }
4547
4548 // _S_store {{{2
    // Store the mask __v as _Np bool objects (bytes holding 0 or 1) to __mem.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v,
                                                   bool* __mem) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            if constexpr (__have_avx512bw_vl)
              // Expand the bitmask to one byte per element (0 or 1) and store.
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>([](auto __data) {
                  if constexpr (_Np <= 16)
                    return _mm_maskz_set1_epi8(__data, 1);
                  else if constexpr (_Np <= 32)
                    return _mm256_maskz_set1_epi8(__data, 1);
                  else
                    return _mm512_maskz_set1_epi8(__data, 1);
                }(__v._M_data)),
                __mem);
            else if constexpr (_Np <= 8)
              // pdep scatters the mask bits into the low bit of each byte.
              _CommonImplX86::_S_store<_Np>(
                __vector_bitcast<char>(
#if defined __x86_64__
                  __make_wrapper<_ULLong>(
                    _pdep_u64(__v._M_data, 0x0101010101010101ULL), 0ull)
#else
                  __make_wrapper<_UInt>(_pdep_u32(__v._M_data, 0x01010101U),
                                        _pdep_u32(__v._M_data >> 4,
                                                  0x01010101U))
#endif
                  ),
                __mem);
            else if constexpr (_Np <= 16)
              // Expand to 32-bit lanes, then narrowing store writes one byte
              // per element.
              _mm512_mask_cvtepi32_storeu_epi8(
                __mem, 0xffffu >> (16 - _Np),
                _mm512_maskz_set1_epi32(__v._M_data, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (__is_sse_abi<_Abi>()) //{{{
          {
            if constexpr (_Np == 2 && sizeof(_Tp) == 8)
              {
                // Negate the (0 or -1) high halves to get 0 or 1.
                const auto __k = __vector_bitcast<int>(__v);
                __mem[0] = -__k[1];
                __mem[1] = -__k[3];
              }
            else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
              {
                if constexpr (__have_sse2)
                  {
                    // Pack 32-bit mask elements down to bytes, keep bit 0.
                    const unsigned __bool4
                      = __vector_bitcast<_UInt>(_mm_packs_epi16(
                          _mm_packs_epi32(__intrin_bitcast<__m128i>(
                                            __to_intrin(__v)),
                                          __m128i()),
                          __m128i()))[0]
                        & 0x01010101u;
                    __builtin_memcpy(__mem, &__bool4, _Np);
                  }
                else if constexpr (__have_mmx)
                  {
                    const __m64 __k = _mm_cvtps_pi8(
                      __and(__to_intrin(__v), _mm_set1_ps(1.f)));
                    __builtin_memcpy(__mem, &__k, _Np);
                    _mm_empty(); // leave MMX state, restore x87
                  }
                else
                  return _Base::_S_store(__v, __mem);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
              {
                // Shift the sign bit down to bit 0, then pack to bytes.
                _CommonImplX86::_S_store<_Np>(
                  __vector_bitcast<char>(_mm_packs_epi16(
                    __to_intrin(__vector_bitcast<_UShort>(__v) >> 15),
                    __m128i())),
                  __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(__v._M_data & 1, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else if constexpr (__is_avx_abi<_Abi>()) // {{{
          {
            if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
              {
                // movemask yields one bit per byte; the low byte-bit of each
                // 64-bit element is the bool value.
                auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                int __bool4;
                if constexpr (__have_avx2)
                  __bool4 = _mm256_movemask_epi8(__k);
                else
                  __bool4 = (_mm_movemask_epi8(__lo128(__k))
                             | (_mm_movemask_epi8(__hi128(__k)) << 16));
                __bool4 &= 0x01010101;
                __builtin_memcpy(__mem, &__bool4, _Np);
              }
            else if constexpr (_Np <= 8 && sizeof(_Tp) == 4)
              {
                const auto __k = __intrin_bitcast<__m256i>(__to_intrin(__v));
                const auto __k2
                  = _mm_srli_epi16(_mm_packs_epi16(__lo128(__k), __hi128(__k)),
                                   15);
                const auto __k3
                  = __vector_bitcast<char>(_mm_packs_epi16(__k2, __m128i()));
                _CommonImplX86::_S_store<_Np>(__k3, __mem);
              }
            else if constexpr (_Np <= 16 && sizeof(_Tp) == 2)
              {
                if constexpr (__have_avx2)
                  {
                    const auto __x = _mm256_srli_epi16(__to_intrin(__v), 15);
                    const auto __bools = __vector_bitcast<char>(
                      _mm_packs_epi16(__lo128(__x), __hi128(__x)));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
                else
                  {
                    const auto __bools
                      = 1
                        & __vector_bitcast<_UChar>(
                          _mm_packs_epi16(__lo128(__to_intrin(__v)),
                                          __hi128(__to_intrin(__v))));
                    _CommonImplX86::_S_store<_Np>(__bools, __mem);
                  }
              }
            else if constexpr (_Np <= 32 && sizeof(_Tp) == 1)
              _CommonImplX86::_S_store<_Np>(1 & __v._M_data, __mem);
            else
              __assert_unreachable<_Tp>();
          } // }}}
        else
          __assert_unreachable<_Tp>();
      }
4681
4682 // _S_masked_store {{{2
    // Store the mask __v as bool objects to __mem, writing only the bytes at
    // positions where __k is true.
    template <typename _Tp, size_t _Np>
      static inline void
      _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
                      const _SimdWrapper<_Tp, _Np> __k) noexcept
      {
        if constexpr (__is_avx512_abi<_Abi>())
          {
            static_assert(is_same_v<_Tp, bool>);
            // Expand the bitmask to 0/1 bytes, then do a masked byte store.
            if constexpr (_Np <= 16 && __have_avx512bw_vl)
              _mm_mask_storeu_epi8(__mem, __k, _mm_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 16)
              // No BW/VL: expand to 32-bit lanes, narrowing masked store.
              _mm512_mask_cvtepi32_storeu_epi8(__mem, __k,
                                               _mm512_maskz_set1_epi32(__v, 1));
            else if constexpr (_Np <= 32 && __have_avx512bw_vl)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      _mm256_maskz_set1_epi8(__v, 1));
            else if constexpr (_Np <= 32 && __have_avx512bw)
              _mm256_mask_storeu_epi8(__mem, __k,
                                      __lo256(_mm512_maskz_set1_epi8(__v, 1)));
            else if constexpr (_Np <= 64 && __have_avx512bw)
              _mm512_mask_storeu_epi8(__mem, __k,
                                      _mm512_maskz_set1_epi8(__v, 1));
            else
              __assert_unreachable<_Tp>();
          }
        else
          _Base::_S_masked_store(__v, __mem, __k);
      }
4711
4712 // logical and bitwise operators {{{2
    // Element-wise logical AND. For AVX512 bitmasks use the k-register AND
    // instructions; otherwise fall back to the generic vector implementation.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x,
                     const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            // kandb requires AVX512DQ; kandd/kandq require AVX512BW.
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_and(__x, __y);
      }
4734
    // Element-wise logical OR. k-register OR for AVX512 bitmasks, generic
    // vector fallback otherwise.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x,
                    const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            // korb requires AVX512DQ; kord/korq require AVX512BW.
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_logical_or(__x, __y);
      }
4756
    // Element-wise NOT. For AVX512 bitmasks, kandn with the implicit mask
    // both inverts and keeps bits past _Np cleared.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            // kandn computes ~first & second: ~__x restricted to valid bits.
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kandn_mask8(__x._M_data,
                                  _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (_Np <= 16)
              return _kandn_mask16(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kandn_mask32(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kandn_mask64(__x._M_data,
                                   _Abi::template __implicit_mask_n<_Np>());
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_not(__x);
      }
4781
    // Element-wise bitwise AND; identical to _S_logical_and for mask types.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x,
                 const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kand_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kand_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kand_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kand_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_and(__x, __y);
      }
4803
    // Element-wise bitwise OR; identical to _S_logical_or for mask types.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x,
                const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_or(__x, __y);
      }
4825
    // Element-wise bitwise XOR using k-register XOR for AVX512 bitmasks.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x,
                 const _SimdWrapper<_Tp, _Np>& __y)
      {
        if constexpr (is_same_v<_Tp, bool>)
          {
            // kxorb requires AVX512DQ; kxord/kxorq require AVX512BW.
            if constexpr (__have_avx512dq && _Np <= 8)
              return _kxor_mask8(__x._M_data, __y._M_data);
            else if constexpr (_Np <= 16)
              return _kxor_mask16(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 32)
              return _kxor_mask32(__x._M_data, __y._M_data);
            else if constexpr (__have_avx512bw && _Np <= 64)
              return _kxor_mask64(__x._M_data, __y._M_data);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_bit_xor(__x, __y);
      }
4847
4848 //}}}2
4849 // _S_masked_assign{{{
4850 template <size_t _Np>
4851 _GLIBCXX_SIMD_INTRINSIC static void
4852 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4853 _SimdWrapper<bool, _Np>& __lhs,
4854 _SimdWrapper<bool, _Np> __rhs)
4855 {
4856 __lhs._M_data
4857 = (~__k._M_data & __lhs._M_data) | (__k._M_data & __rhs._M_data);
4858 }
4859
4860 template <size_t _Np>
4861 _GLIBCXX_SIMD_INTRINSIC static void
4862 _S_masked_assign(_SimdWrapper<bool, _Np> __k,
4863 _SimdWrapper<bool, _Np>& __lhs, bool __rhs)
4864 {
4865 if (__rhs)
4866 __lhs._M_data = __k._M_data | __lhs._M_data;
4867 else
4868 __lhs._M_data = ~__k._M_data & __lhs._M_data;
4869 }
4870
4871 using _MaskImplBuiltin<_Abi>::_S_masked_assign;
4872
4873 //}}}
4874 // _S_all_of {{{
    // Return true iff every element of __k (within the ABI's implicit mask)
    // is true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                // ptest CF: all bits of __a set wherever __b (implicit mask)
                // is set.
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testc(__a, __b);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1))
                     == (1 << _Np) - 1;
            else if constexpr (is_same_v<_Tp, double>)
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1))
                     == (1 << _Np) - 1;
            else
              return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                     == (1 << (_Np * sizeof(_Tp))) - 1;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          {
            // kortest CF: OR of the two k-registers is all-ones. OR the mask
            // with the inverted implicit mask so padding bits don't matter.
            constexpr auto _Mask = _Abi::template _S_implicit_mask<_Tp>();
            const auto __kk = __k._M_data._M_data;
            if constexpr (sizeof(__kk) == 1)
              {
                if constexpr (__have_avx512dq)
                  return _kortestc_mask8_u8(__kk, _Mask == 0xff
                                                    ? __kk
                                                    : __mmask8(~_Mask));
                else
                  return _kortestc_mask16_u8(__kk, __mmask16(~_Mask));
              }
            else if constexpr (sizeof(__kk) == 2)
              return _kortestc_mask16_u8(__kk, _Mask == 0xffff
                                                 ? __kk
                                                 : __mmask16(~_Mask));
            else if constexpr (sizeof(__kk) == 4 && __have_avx512bw)
              return _kortestc_mask32_u8(__kk, _Mask == 0xffffffffU
                                                 ? __kk
                                                 : __mmask32(~_Mask));
            else if constexpr (sizeof(__kk) == 8 && __have_avx512bw)
              return _kortestc_mask64_u8(__kk, _Mask == 0xffffffffffffffffULL
                                                 ? __kk
                                                 : __mmask64(~_Mask));
            else
              __assert_unreachable<_Tp>();
          }
      }
4928
4929 // }}}
4930 // _S_any_of {{{
    // Return true iff at least one element of __k is true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                // ptest ZF: __a & __b == 0 means no element is true.
                if constexpr (_Abi::template _S_is_partial<
                                _Tp> || sizeof(__k) < 16)
                  {
                    // Restrict to valid elements via the implicit mask.
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 == __testz(__a, __b);
                  }
                else
                  return 0 == __testz(__a, __a);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (_mm_movemask_ps(__a) & ((1 << _Np) - 1)) != 0;
            else if constexpr (is_same_v<_Tp, double>)
              return (_mm_movemask_pd(__a) & ((1 << _Np) - 1)) != 0;
            else
              return (_mm_movemask_epi8(__a) & ((1 << (_Np * sizeof(_Tp))) - 1))
                     != 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          // Bitmask: any valid bit set.
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                 != 0;
      }
4963
4964 // }}}
4965 // _S_none_of {{{
    // Return true iff no element of __k is true.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                // ptest ZF set means __a has no bits in common with __b.
                if constexpr (_Abi::template _S_is_partial<
                                _Tp> || sizeof(__k) < 16)
                  {
                    // Restrict to valid elements via the implicit mask.
                    _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                      = _Abi::template _S_implicit_mask_intrin<_Tp>();
                    return 0 != __testz(__a, __b);
                  }
                else
                  return 0 != __testz(__a, __a);
              }
            else if constexpr (is_same_v<_Tp, float>)
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else if constexpr (is_same_v<_Tp, double>)
              return (__movemask(__a) & ((1 << _Np) - 1)) == 0;
            else
              return (__movemask(__a) & int((1ull << (_Np * sizeof(_Tp))) - 1))
                     == 0;
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          // Bitmask: no valid bit set.
          return (__k._M_data._M_data & _Abi::template _S_implicit_mask<_Tp>())
                 == 0;
      }
4998
4999 // }}}
5000 // _S_some_of {{{
    // Return true iff __k contains both true and false elements (i.e. neither
    // all_of nor none_of).
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (__is_sse_abi<_Abi>() || __is_avx_abi<_Abi>())
          {
            constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
            using _TI = __intrinsic_type_t<_Tp, _Np>;
            const _TI __a = reinterpret_cast<_TI>(__to_intrin(__data(__k)));
            if constexpr (__have_sse4_1)
              {
                // ptest with ZF and CF both clear: __a is neither disjoint
                // from nor a superset of the implicit mask __b.
                _GLIBCXX_SIMD_USE_CONSTEXPR _TI __b
                  = _Abi::template _S_implicit_mask_intrin<_Tp>();
                return 0 != __testnzc(__a, __b);
              }
            else if constexpr (is_same_v<_Tp, float>)
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_ps(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else if constexpr (is_same_v<_Tp, double>)
              {
                constexpr int __allbits = (1 << _Np) - 1;
                const auto __tmp = _mm_movemask_pd(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
            else
              {
                constexpr int __allbits = (1 << (_Np * sizeof(_Tp))) - 1;
                const auto __tmp = _mm_movemask_epi8(__a) & __allbits;
                return __tmp > 0 && __tmp < __allbits;
              }
          }
        else if constexpr (__is_avx512_abi<_Abi>())
          return _S_any_of(__k) && !_S_all_of(__k);
        else
          __assert_unreachable<_Tp>();
      }
5039
5040 // }}}
5041 // _S_popcount {{{
5042 template <typename _Tp>
5043 _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
5044 {
5045 constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
5046 const auto __kk = _Abi::_S_masked(__k._M_data)._M_data;
5047 if constexpr (__is_avx512_abi<_Abi>())
5048 {
5049 if constexpr (_Np > 32)
5050 return __builtin_popcountll(__kk);
5051 else
5052 return __builtin_popcount(__kk);
5053 }
5054 else
5055 {
5056 if constexpr (__have_popcnt)
5057 {
5058 int __bits
5059 = __movemask(__to_intrin(__vector_bitcast<_Tp>(__kk)));
5060 const int __count = __builtin_popcount(__bits);
5061 return is_integral_v<_Tp> ? __count / sizeof(_Tp) : __count;
5062 }
5063 else if constexpr (_Np == 2 && sizeof(_Tp) == 8)
5064 {
5065 const int mask = _mm_movemask_pd(__auto_bitcast(__kk));
5066 return mask - (mask >> 1);
5067 }
5068 else if constexpr (_Np <= 4 && sizeof(_Tp) == 8)
5069 {
5070 auto __x = -(__lo128(__kk) + __hi128(__kk));
5071 return __x[0] + __x[1];
5072 }
5073 else if constexpr (_Np <= 4 && sizeof(_Tp) == 4)
5074 {
5075 if constexpr (__have_sse2)
5076 {
5077 __m128i __x = __intrin_bitcast<__m128i>(__to_intrin(__kk));
5078 __x = _mm_add_epi32(
5079 __x, _mm_shuffle_epi32(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5080 __x = _mm_add_epi32(
5081 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(1, 0, 3, 2)));
5082 return -_mm_cvtsi128_si32(__x);
5083 }
5084 else
5085 return __builtin_popcount(
5086 _mm_movemask_ps(__auto_bitcast(__kk)));
5087 }
5088 else if constexpr (_Np <= 8 && sizeof(_Tp) == 2)
5089 {
5090 auto __x = __to_intrin(__kk);
5091 __x = _mm_add_epi16(__x,
5092 _mm_shuffle_epi32(__x,
5093 _MM_SHUFFLE(0, 1, 2, 3)));
5094 __x = _mm_add_epi16(
5095 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2, 3)));
5096 __x = _mm_add_epi16(
5097 __x, _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0, 1)));
5098 return -short(_mm_extract_epi16(__x, 0));
5099 }
5100 else if constexpr (_Np <= 16 && sizeof(_Tp) == 1)
5101 {
5102 auto __x = __to_intrin(__kk);
5103 __x = _mm_add_epi8(__x,
5104 _mm_shuffle_epi32(__x,
5105 _MM_SHUFFLE(0, 1, 2, 3)));
5106 __x = _mm_add_epi8(__x,
5107 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(0, 1, 2,
5108 3)));
5109 __x = _mm_add_epi8(__x,
5110 _mm_shufflelo_epi16(__x, _MM_SHUFFLE(2, 3, 0,
5111 1)));
5112 auto __y = -__vector_bitcast<_UChar>(__x);
5113 if constexpr (__have_sse4_1)
5114 return __y[0] + __y[1];
5115 else
5116 {
5117 unsigned __z = _mm_extract_epi16(__to_intrin(__y), 0);
5118 return (__z & 0xff) + (__z >> 8);
5119 }
5120 }
5121 else if constexpr (sizeof(__kk) == 32)
5122 {
5123 // The following works only as long as the implementations above
5124 // use a summation
5125 using _I = __int_for_sizeof_t<_Tp>;
5126 const auto __as_int = __vector_bitcast<_I>(__kk);
5127 _MaskImplX86<simd_abi::__sse>::_S_popcount(
5128 simd_mask<_I, simd_abi::__sse>(__private_init,
5129 __lo128(__as_int)
5130 + __hi128(__as_int)));
5131 }
5132 else
5133 __assert_unreachable<_Tp>();
5134 }
5135 }
5136
5137 // }}}
5138 // _S_find_first_set {{{
5139 template <typename _Tp>
5140 _GLIBCXX_SIMD_INTRINSIC static int
5141 _S_find_first_set(simd_mask<_Tp, _Abi> __k)
5142 {
5143 if constexpr (__is_avx512_abi<_Abi>())
5144 return std::__countr_zero(__k._M_data._M_data);
5145 else
5146 return _Base::_S_find_first_set(__k);
5147 }
5148
5149 // }}}
5150 // _S_find_last_set {{{
5151 template <typename _Tp>
5152 _GLIBCXX_SIMD_INTRINSIC static int
5153 _S_find_last_set(simd_mask<_Tp, _Abi> __k)
5154 {
5155 if constexpr (__is_avx512_abi<_Abi>())
5156 return std::__bit_width(__k._M_data._M_data) - 1;
5157 else
5158 return _Base::_S_find_last_set(__k);
5159 }
5160
5161 // }}}
5162 };
5163
5164// }}}
5165
5166_GLIBCXX_SIMD_END_NAMESPACE
5167#endif // __cplusplus >= 201703L
5168#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
5169
5170// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
typename conditional< _Cond, _Iftrue, _Iffalse >::type conditional_t
Alias template for conditional.
Definition: type_traits:2552
typename enable_if< _Cond, _Tp >::type enable_if_t
Alias template for enable_if.
Definition: type_traits:2548
constexpr const _Tp & min(const _Tp &, const _Tp &)
Returns the smaller of its two arguments (the first argument when they compare equal).
Definition: stl_algobase.h:230