// Definition of the public simd interfaces -*- C++ -*-

// Copyright (C) 2020-2022 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_H #define _GLIBCXX_EXPERIMENTAL_SIMD_H #if __cplusplus >= 201703L #include "simd_detail.h" #include "numeric_traits.h" #include #include #ifdef _GLIBCXX_DEBUG_UB #include // for stderr #endif #include #include #include #include #include #if _GLIBCXX_SIMD_X86INTRIN #include #elif _GLIBCXX_SIMD_HAVE_NEON #pragma GCC diagnostic push // narrowing conversion of '__a' from 'uint64_t' {aka 'long long unsigned int'} to // 'int64x1_t' {aka 'long long int'} [-Wnarrowing] #pragma GCC diagnostic ignored "-Wnarrowing" #include #pragma GCC diagnostic pop #endif /** @ingroup ts_simd * @{ */ /* There are several closely related types, with the following naming * convention: * _Tp: vectorizable (arithmetic) type (or any type) * _TV: __vector_type_t<_Tp, _Np> * _TW: _SimdWrapper<_Tp, _Np> * _TI: __intrinsic_type_t<_Tp, _Np> * _TVT: _VectorTraits<_TV> or _VectorTraits<_TW> * If one additional type is needed use _U instead of _T. * Otherwise use _T\d, _TV\d, _TW\d, TI\d, _TVT\d. 
* * More naming conventions: * _Ap or _Abi: An ABI tag from the simd_abi namespace * _Ip: often used for integer types with sizeof(_Ip) == sizeof(_Tp), * _IV, _IW as for _TV, _TW * _Np: number of elements (not bytes) * _Bytes: number of bytes * * Variable names: * __k: mask object (vector- or bitmask) */ _GLIBCXX_SIMD_BEGIN_NAMESPACE #if !_GLIBCXX_SIMD_X86INTRIN using __m128 [[__gnu__::__vector_size__(16)]] = float; using __m128d [[__gnu__::__vector_size__(16)]] = double; using __m128i [[__gnu__::__vector_size__(16)]] = long long; using __m256 [[__gnu__::__vector_size__(32)]] = float; using __m256d [[__gnu__::__vector_size__(32)]] = double; using __m256i [[__gnu__::__vector_size__(32)]] = long long; using __m512 [[__gnu__::__vector_size__(64)]] = float; using __m512d [[__gnu__::__vector_size__(64)]] = double; using __m512i [[__gnu__::__vector_size__(64)]] = long long; #endif namespace simd_abi { // simd_abi forward declarations {{{ // implementation details: struct _Scalar; template struct _Fixed; // There are two major ABIs that appear on different architectures. // Both have non-boolean values packed into an N Byte register // -> #elements = N / sizeof(T) // Masks differ: // 1. Use value vector registers for masks (all 0 or all 1) // 2. Use bitmasks (mask registers) with one bit per value in the corresponding // value vector // // Both can be partially used, masking off the rest when doing horizontal // operations or operations that can trap (e.g. FP_INVALID or integer division // by 0). This is encoded as the number of used bytes. 
template struct _VecBuiltin; template struct _VecBltnBtmsk; template using _VecN = _VecBuiltin; template using _Sse = _VecBuiltin<_UsedBytes>; template using _Avx = _VecBuiltin<_UsedBytes>; template using _Avx512 = _VecBltnBtmsk<_UsedBytes>; template using _Neon = _VecBuiltin<_UsedBytes>; // implementation-defined: using __sse = _Sse<>; using __avx = _Avx<>; using __avx512 = _Avx512<>; using __neon = _Neon<>; using __neon128 = _Neon<16>; using __neon64 = _Neon<8>; // standard: template struct deduce; template using fixed_size = _Fixed<_Np>; using scalar = _Scalar; // }}} } // namespace simd_abi // forward declarations is_simd(_mask), simd(_mask), simd_size {{{ template struct is_simd; template struct is_simd_mask; template class simd; template class simd_mask; template struct simd_size; // }}} // load/store flags {{{ struct element_aligned_tag { template static constexpr size_t _S_alignment = alignof(_Up); template _GLIBCXX_SIMD_INTRINSIC static constexpr _Up* _S_apply(_Up* __ptr) { return __ptr; } }; struct vector_aligned_tag { template static constexpr size_t _S_alignment = std::__bit_ceil(sizeof(_Up) * _Tp::size()); template _GLIBCXX_SIMD_INTRINSIC static constexpr _Up* _S_apply(_Up* __ptr) { return static_cast<_Up*>(__builtin_assume_aligned(__ptr, _S_alignment<_Tp, _Up>)); } }; template struct overaligned_tag { template static constexpr size_t _S_alignment = _Np; template _GLIBCXX_SIMD_INTRINSIC static constexpr _Up* _S_apply(_Up* __ptr) { return static_cast<_Up*>(__builtin_assume_aligned(__ptr, _Np)); } }; inline constexpr element_aligned_tag element_aligned = {}; inline constexpr vector_aligned_tag vector_aligned = {}; template inline constexpr overaligned_tag<_Np> overaligned = {}; // }}} template using _SizeConstant = integral_constant; // constexpr feature detection{{{ constexpr inline bool __have_mmx = _GLIBCXX_SIMD_HAVE_MMX; constexpr inline bool __have_sse = _GLIBCXX_SIMD_HAVE_SSE; constexpr inline bool __have_sse2 = _GLIBCXX_SIMD_HAVE_SSE2; constexpr 
inline bool __have_sse3 = _GLIBCXX_SIMD_HAVE_SSE3; constexpr inline bool __have_ssse3 = _GLIBCXX_SIMD_HAVE_SSSE3; constexpr inline bool __have_sse4_1 = _GLIBCXX_SIMD_HAVE_SSE4_1; constexpr inline bool __have_sse4_2 = _GLIBCXX_SIMD_HAVE_SSE4_2; constexpr inline bool __have_xop = _GLIBCXX_SIMD_HAVE_XOP; constexpr inline bool __have_avx = _GLIBCXX_SIMD_HAVE_AVX; constexpr inline bool __have_avx2 = _GLIBCXX_SIMD_HAVE_AVX2; constexpr inline bool __have_bmi = _GLIBCXX_SIMD_HAVE_BMI1; constexpr inline bool __have_bmi2 = _GLIBCXX_SIMD_HAVE_BMI2; constexpr inline bool __have_lzcnt = _GLIBCXX_SIMD_HAVE_LZCNT; constexpr inline bool __have_sse4a = _GLIBCXX_SIMD_HAVE_SSE4A; constexpr inline bool __have_fma = _GLIBCXX_SIMD_HAVE_FMA; constexpr inline bool __have_fma4 = _GLIBCXX_SIMD_HAVE_FMA4; constexpr inline bool __have_f16c = _GLIBCXX_SIMD_HAVE_F16C; constexpr inline bool __have_popcnt = _GLIBCXX_SIMD_HAVE_POPCNT; constexpr inline bool __have_avx512f = _GLIBCXX_SIMD_HAVE_AVX512F; constexpr inline bool __have_avx512dq = _GLIBCXX_SIMD_HAVE_AVX512DQ; constexpr inline bool __have_avx512vl = _GLIBCXX_SIMD_HAVE_AVX512VL; constexpr inline bool __have_avx512bw = _GLIBCXX_SIMD_HAVE_AVX512BW; constexpr inline bool __have_avx512dq_vl = __have_avx512dq && __have_avx512vl; constexpr inline bool __have_avx512bw_vl = __have_avx512bw && __have_avx512vl; constexpr inline bool __have_avx512bitalg = _GLIBCXX_SIMD_HAVE_AVX512BITALG; constexpr inline bool __have_avx512vbmi2 = _GLIBCXX_SIMD_HAVE_AVX512VBMI2; constexpr inline bool __have_avx512vbmi = _GLIBCXX_SIMD_HAVE_AVX512VBMI; constexpr inline bool __have_avx512ifma = _GLIBCXX_SIMD_HAVE_AVX512IFMA; constexpr inline bool __have_avx512cd = _GLIBCXX_SIMD_HAVE_AVX512CD; constexpr inline bool __have_avx512vnni = _GLIBCXX_SIMD_HAVE_AVX512VNNI; constexpr inline bool __have_avx512vpopcntdq = _GLIBCXX_SIMD_HAVE_AVX512VPOPCNTDQ; constexpr inline bool __have_avx512vp2intersect = _GLIBCXX_SIMD_HAVE_AVX512VP2INTERSECT; constexpr inline bool __have_neon = 
_GLIBCXX_SIMD_HAVE_NEON; constexpr inline bool __have_neon_a32 = _GLIBCXX_SIMD_HAVE_NEON_A32; constexpr inline bool __have_neon_a64 = _GLIBCXX_SIMD_HAVE_NEON_A64; constexpr inline bool __support_neon_float = #if defined __GCC_IEC_559 __GCC_IEC_559 == 0; #elif defined __FAST_MATH__ true; #else false; #endif #ifdef _ARCH_PWR10 constexpr inline bool __have_power10vec = true; #else constexpr inline bool __have_power10vec = false; #endif #ifdef __POWER9_VECTOR__ constexpr inline bool __have_power9vec = true; #else constexpr inline bool __have_power9vec = false; #endif #if defined __POWER8_VECTOR__ constexpr inline bool __have_power8vec = true; #else constexpr inline bool __have_power8vec = __have_power9vec; #endif #if defined __VSX__ constexpr inline bool __have_power_vsx = true; #else constexpr inline bool __have_power_vsx = __have_power8vec; #endif #if defined __ALTIVEC__ constexpr inline bool __have_power_vmx = true; #else constexpr inline bool __have_power_vmx = __have_power_vsx; #endif // }}} namespace __detail { #ifdef math_errhandling // Determines _S_handle_fpexcept from math_errhandling if it is defined and expands to a constant // expression. math_errhandling may expand to an extern symbol, in which case a constexpr value // must be guessed. template constexpr bool __handle_fpexcept_impl(int) { return math_errhandling & MATH_ERREXCEPT; } #endif // Fallback if math_errhandling doesn't work: with fast-math assume floating-point exceptions are // ignored, otherwise implement correct exception behavior. constexpr bool __handle_fpexcept_impl(float) { #if defined __FAST_MATH__ return false; #else return true; #endif } /// True if math functions must raise floating-point exceptions as specified by C17. 
static constexpr bool _S_handle_fpexcept = __handle_fpexcept_impl(0); constexpr std::uint_least64_t __floating_point_flags() { std::uint_least64_t __flags = 0; if constexpr (_S_handle_fpexcept) __flags |= 1; #ifdef __FAST_MATH__ __flags |= 1 << 1; #elif __FINITE_MATH_ONLY__ __flags |= 2 << 1; #elif __GCC_IEC_559 < 2 __flags |= 3 << 1; #endif __flags |= (__FLT_EVAL_METHOD__ + 1) << 3; return __flags; } constexpr std::uint_least64_t __machine_flags() { if constexpr (__have_mmx || __have_sse) return __have_mmx | (__have_sse << 1) | (__have_sse2 << 2) | (__have_sse3 << 3) | (__have_ssse3 << 4) | (__have_sse4_1 << 5) | (__have_sse4_2 << 6) | (__have_xop << 7) | (__have_avx << 8) | (__have_avx2 << 9) | (__have_bmi << 10) | (__have_bmi2 << 11) | (__have_lzcnt << 12) | (__have_sse4a << 13) | (__have_fma << 14) | (__have_fma4 << 15) | (__have_f16c << 16) | (__have_popcnt << 17) | (__have_avx512f << 18) | (__have_avx512dq << 19) | (__have_avx512vl << 20) | (__have_avx512bw << 21) | (__have_avx512bitalg << 22) | (__have_avx512vbmi2 << 23) | (__have_avx512vbmi << 24) | (__have_avx512ifma << 25) | (__have_avx512cd << 26) | (__have_avx512vnni << 27) | (__have_avx512vpopcntdq << 28) | (__have_avx512vp2intersect << 29); else if constexpr (__have_neon) return __have_neon | (__have_neon_a32 << 1) | (__have_neon_a64 << 2) | (__have_neon_a64 << 2) | (__support_neon_float << 3); else if constexpr (__have_power_vmx) return __have_power_vmx | (__have_power_vsx << 1) | (__have_power8vec << 2) | (__have_power9vec << 3) | (__have_power10vec << 4); else return 0; } namespace { struct _OdrEnforcer {}; } template struct _MachineFlagsTemplate {}; /**@internal * Use this type as default template argument to all function templates that * are not declared always_inline. It ensures, that a function * specialization, which the compiler decides not to inline, has a unique symbol * (_OdrEnforcer) or a symbol matching the machine/architecture flags * (_MachineFlagsTemplate). 
This helps to avoid ODR violations in cases where * users link TUs compiled with different flags. This is especially important * for using simd in libraries. */ using __odr_helper = conditional_t<__machine_flags() == 0, _OdrEnforcer, _MachineFlagsTemplate<__machine_flags(), __floating_point_flags()>>; struct _Minimum { template _GLIBCXX_SIMD_INTRINSIC constexpr _Tp operator()(_Tp __a, _Tp __b) const { using std::min; return min(__a, __b); } }; struct _Maximum { template _GLIBCXX_SIMD_INTRINSIC constexpr _Tp operator()(_Tp __a, _Tp __b) const { using std::max; return max(__a, __b); } }; } // namespace __detail // unrolled/pack execution helpers // __execute_n_times{{{ template [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr void __execute_on_index_sequence(_Fp&& __f, index_sequence<_I...>) { ((void)__f(_SizeConstant<_I>()), ...); } template _GLIBCXX_SIMD_INTRINSIC constexpr void __execute_on_index_sequence(_Fp&&, index_sequence<>) { } template _GLIBCXX_SIMD_INTRINSIC constexpr void __execute_n_times(_Fp&& __f) { __execute_on_index_sequence(static_cast<_Fp&&>(__f), make_index_sequence<_Np>{}); } // }}} // __generate_from_n_evaluations{{{ template [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr _R __execute_on_index_sequence_with_return(_Fp&& __f, index_sequence<_I...>) { return _R{__f(_SizeConstant<_I>())...}; } template _GLIBCXX_SIMD_INTRINSIC constexpr _R __generate_from_n_evaluations(_Fp&& __f) { return __execute_on_index_sequence_with_return<_R>( static_cast<_Fp&&>(__f), make_index_sequence<_Np>{}); } // }}} // __call_with_n_evaluations{{{ template [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_n_evaluations(index_sequence<_I...>, _F0&& __f0, _FArgs&& __fargs) { return __f0(__fargs(_SizeConstant<_I>())...); } template _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_n_evaluations(_F0&& __f0, _FArgs&& __fargs) { return __call_with_n_evaluations(make_index_sequence<_Np>{}, static_cast<_F0&&>(__f0), 
static_cast<_FArgs&&>(__fargs)); } // }}} // __call_with_subscripts{{{ template [[__gnu__::__flatten__]] _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_subscripts(_Tp&& __x, index_sequence<_It...>, _Fp&& __fun) { return __fun(__x[_First + _It]...); } template _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_subscripts(_Tp&& __x, _Fp&& __fun) { return __call_with_subscripts<_First>(static_cast<_Tp&&>(__x), make_index_sequence<_Np>(), static_cast<_Fp&&>(__fun)); } // }}} // vvv ---- type traits ---- vvv // integer type aliases{{{ using _UChar = unsigned char; using _SChar = signed char; using _UShort = unsigned short; using _UInt = unsigned int; using _ULong = unsigned long; using _ULLong = unsigned long long; using _LLong = long long; //}}} // __first_of_pack{{{ template struct __first_of_pack { using type = _T0; }; template using __first_of_pack_t = typename __first_of_pack<_Ts...>::type; //}}} // __value_type_or_identity_t {{{ template typename _Tp::value_type __value_type_or_identity_impl(int); template _Tp __value_type_or_identity_impl(float); template using __value_type_or_identity_t = decltype(__value_type_or_identity_impl<_Tp>(int())); // }}} // __is_vectorizable {{{ template struct __is_vectorizable : public is_arithmetic<_Tp> {}; template <> struct __is_vectorizable : public false_type {}; template inline constexpr bool __is_vectorizable_v = __is_vectorizable<_Tp>::value; // Deduces to a vectorizable type template >> using _Vectorizable = _Tp; // }}} // _LoadStorePtr / __is_possible_loadstore_conversion {{{ template struct __is_possible_loadstore_conversion : conjunction<__is_vectorizable<_Ptr>, __is_vectorizable<_ValueType>> {}; template <> struct __is_possible_loadstore_conversion : true_type {}; // Deduces to a type allowed for load/store with the given value type. 
template ::value>> using _LoadStorePtr = _Ptr; // }}} // __is_bitmask{{{ template > struct __is_bitmask : false_type {}; template inline constexpr bool __is_bitmask_v = __is_bitmask<_Tp>::value; // the __mmaskXX case: template struct __is_bitmask<_Tp, void_t() = declval<_Tp>() & 1u)>> : true_type {}; // }}} // __int_for_sizeof{{{ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpedantic" template constexpr auto __int_for_sizeof() { static_assert(_Bytes > 0); if constexpr (_Bytes == sizeof(int)) return int(); #ifdef __clang__ else if constexpr (_Bytes == sizeof(char)) return char(); #else else if constexpr (_Bytes == sizeof(_SChar)) return _SChar(); #endif else if constexpr (_Bytes == sizeof(short)) return short(); #ifndef __clang__ else if constexpr (_Bytes == sizeof(long)) return long(); #endif else if constexpr (_Bytes == sizeof(_LLong)) return _LLong(); #ifdef __SIZEOF_INT128__ else if constexpr (_Bytes == sizeof(__int128)) return __int128(); #endif // __SIZEOF_INT128__ else if constexpr (_Bytes % sizeof(int) == 0) { constexpr size_t _Np = _Bytes / sizeof(int); struct _Ip { int _M_data[_Np]; _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator&(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __rhs._M_data[__i] & _M_data[__i]; }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator|(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __rhs._M_data[__i] | _M_data[__i]; }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator^(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __rhs._M_data[__i] ^ _M_data[__i]; }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator~() const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return ~_M_data[__i]; }); } }; return _Ip{}; } else static_assert(_Bytes == 
0, "this should be unreachable"); } #pragma GCC diagnostic pop template using __int_for_sizeof_t = decltype(__int_for_sizeof()); template using __int_with_sizeof_t = decltype(__int_for_sizeof<_Np>()); // }}} // __is_fixed_size_abi{{{ template struct __is_fixed_size_abi : false_type {}; template struct __is_fixed_size_abi> : true_type {}; template inline constexpr bool __is_fixed_size_abi_v = __is_fixed_size_abi<_Tp>::value; // }}} // __is_scalar_abi {{{ template constexpr bool __is_scalar_abi() { return is_same_v; } // }}} // __abi_bytes_v {{{ template