// Copyright 2021 Google LLC // Copyright 2023,2024 Arm Limited and/or // its affiliates // SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: BSD-3-Clause // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Target-independent types/functions defined after target-specific ops. // The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip // the generic implementation here if native ops are already defined. #include "hwy/base.h" // Define detail::Shuffle1230 etc, but only when viewing the current header; // normally this is included via highway.h, which includes ops/*.h. #if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED) #include "hwy/detect_targets.h" #include "hwy/ops/emu128-inl.h" #endif // HWY_IDE // Relies on the external include guard in highway.h. HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { // The lane type of a vector type, e.g. float for Vec>. template using LaneType = decltype(GetLane(V())); // Vector type, e.g. Vec128 for CappedTag. Useful as the return // type of functions that do not take a vector argument, or as an argument type // if the function only has a template argument for D, or for explicit type // names instead of auto. This may be a built-in type. template using Vec = decltype(Zero(D())); // Mask type. Useful as the return type of functions that do not take a mask // argument, or as an argument type if the function only has a template argument // for D, or for explicit type names instead of auto. 
template using Mask = decltype(MaskFromVec(Zero(D()))); // Returns the closest value to v within [lo, hi]. template HWY_API V Clamp(const V v, const V lo, const V hi) { return Min(Max(lo, v), hi); } // CombineShiftRightBytes (and -Lanes) are not available for the scalar target, // and RVV has its own implementation of -Lanes. #if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV) || HWY_IDE template HWY_API VFromD CombineShiftRightLanes(D d, VFromD hi, VFromD lo) { constexpr size_t kBytes = kLanes * sizeof(TFromD); static_assert(kBytes < 16, "Shift count is per-block"); return CombineShiftRightBytes(d, hi, lo); } #endif // Returns lanes with the most significant bit set and all other bits zero. template HWY_API Vec SignBit(D d) { const RebindToUnsigned du; return BitCast(d, Set(du, SignMask>())); } // Returns quiet NaN. template HWY_API Vec NaN(D d) { const RebindToSigned di; // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus // mantissa MSB (to indicate quiet) would be sufficient. return BitCast(d, Set(di, LimitsMax>())); } // Returns positive infinity. 
template HWY_API Vec Inf(D d) { const RebindToUnsigned du; using T = TFromD; using TU = TFromD; const TU max_x2 = static_cast(MaxExponentTimes2()); return BitCast(d, Set(du, max_x2 >> 1)); } // ------------------------------ MaskedSetOr/MaskedSet template , typename D = DFromV, typename M = MFromD> HWY_API V MaskedSetOr(V no, M m, T a) { D d; return IfThenElse(m, Set(d, a), no); } template , typename M = MFromD, typename T = TFromD> HWY_API V MaskedSet(D d, M m, T a) { return IfThenElseZero(m, Set(d, a)); } // ------------------------------ ZeroExtendResizeBitCast // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 // target is in emu128-inl.h, and the implementation of // detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h #if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR namespace detail { #if HWY_HAVE_SCALABLE template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Repartition d_to_u8; const auto resized = ResizeBitCast(d_to_u8, v); // Zero the upper bytes which were not present/valid in d_from. 
const size_t num_bytes = Lanes(Repartition()); return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized)); } #else // target that uses fixed-size vectors // Truncating or same-size resizing cast: same as ResizeBitCast template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { return ResizeBitCast(d_to, v); } // Resizing cast to vector that has twice the number of lanes of the source // vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Twice dt_from; return BitCast(d_to, ZeroExtendVector(dt_from, v)); } // Resizing cast to vector that has more than twice the number of lanes of the // source vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { using TFrom = TFromD; constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom); const Repartition d_resize_to; return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes), ResizeBitCast(d_resize_to, v))); } #endif // HWY_HAVE_SCALABLE } // namespace detail #endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR template HWY_API VFromD ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, VFromD v) { return detail::ZeroExtendResizeBitCast(hwy::SizeTag(), hwy::SizeTag(), d_to, d_from, v); } // ------------------------------ SafeFillN template > HWY_API void SafeFillN(const size_t num, const T value, D d, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for (size_t i = 0; i < num; ++i) { to[i] = value; } #else BlendedStore(Set(d, value), FirstN(d, num), d, to); #endif } // ------------------------------ SafeCopyN template > HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for 
(size_t i = 0; i < num; ++i) { to[i] = from[i]; } #else const Mask mask = FirstN(d, num); BlendedStore(MaskedLoad(mask, d, from), mask, d, to); #endif } // ------------------------------ IsNegative #if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_IS_NEGATIVE #undef HWY_NATIVE_IS_NEGATIVE #else #define HWY_NATIVE_IS_NEGATIVE #endif template HWY_API Mask> IsNegative(V v) { const DFromV d; const RebindToSigned di; return RebindMask(d, MaskFromVec(BroadcastSignBit(BitCast(di, v)))); } #endif // HWY_NATIVE_IS_NEGATIVE // ------------------------------ MaskFalse #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASK_FALSE #undef HWY_NATIVE_MASK_FALSE #else #define HWY_NATIVE_MASK_FALSE #endif template HWY_API Mask MaskFalse(D d) { return MaskFromVec(Zero(d)); } #endif // HWY_NATIVE_MASK_FALSE // ------------------------------ IfNegativeThenElseZero #if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO #undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO #else #define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO #endif template HWY_API V IfNegativeThenElseZero(V v, V yes) { return IfThenElseZero(IsNegative(v), yes); } #endif // HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO // ------------------------------ IfNegativeThenZeroElse #if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE #undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE #else #define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE #endif template HWY_API V IfNegativeThenZeroElse(V v, V no) { return IfThenZeroElse(IsNegative(v), no); } #endif // HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE // ------------------------------ ZeroIfNegative (IfNegativeThenZeroElse) // ZeroIfNegative is generic for all vector lengths template HWY_API V ZeroIfNegative(V v) { return IfNegativeThenZeroElse(v, v); } // ------------------------------ BitwiseIfThenElse #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) 
== defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return Or(And(mask, yes), AndNot(mask, no)); } #endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE // ------------------------------ PromoteMaskTo #if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PROMOTE_MASK_TO #undef HWY_NATIVE_PROMOTE_MASK_TO #else #define HWY_NATIVE_PROMOTE_MASK_TO #endif template HWY_API Mask PromoteMaskTo(DTo d_to, DFrom d_from, Mask m) { static_assert( sizeof(TFromD) > sizeof(TFromD), "sizeof(TFromD) must be greater than sizeof(TFromD)"); static_assert( IsSame, Mask, DTo>>>(), "Mask must be the same type as Mask, DTo>>"); const RebindToSigned di_to; const RebindToSigned di_from; return MaskFromVec(BitCast( d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m))))); } #endif // HWY_NATIVE_PROMOTE_MASK_TO // ------------------------------ DemoteMaskTo #if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_DEMOTE_MASK_TO #undef HWY_NATIVE_DEMOTE_MASK_TO #else #define HWY_NATIVE_DEMOTE_MASK_TO #endif template HWY_API Mask DemoteMaskTo(DTo d_to, DFrom d_from, Mask m) { static_assert(sizeof(TFromD) < sizeof(TFromD), "sizeof(TFromD) must be less than sizeof(TFromD)"); static_assert( IsSame, Mask, DTo>>>(), "Mask must be the same type as Mask, DTo>>"); const RebindToSigned di_to; const RebindToSigned di_from; return MaskFromVec( BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m))))); } #endif // HWY_NATIVE_DEMOTE_MASK_TO // ------------------------------ InsertIntoUpper #if (defined(HWY_NATIVE_LOAD_HIGHER) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_HIGHER #undef HWY_NATIVE_LOAD_HIGHER #else #define HWY_NATIVE_LOAD_HIGHER #endif template (), HWY_IF_LANES_GT_D(D, 1), HWY_IF_POW2_GT_D(D, -3)> HWY_API V InsertIntoUpper(D d, T* p, V a) 
{ Half dh; const VFromD b = LoadU(dh, p); return Combine(d, b, LowerHalf(a)); } #endif // HWY_NATIVE_LOAD_HIGHER // ------------------------------ CombineMasks #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_COMBINE_MASKS #undef HWY_NATIVE_COMBINE_MASKS #else #define HWY_NATIVE_COMBINE_MASKS #endif #if HWY_TARGET != HWY_SCALAR || HWY_IDE template HWY_API Mask CombineMasks(D d, Mask> hi, Mask> lo) { const Half dh; return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo))); } #endif #endif // HWY_NATIVE_COMBINE_MASKS // ------------------------------ LowerHalfOfMask #if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK #undef HWY_NATIVE_LOWER_HALF_OF_MASK #else #define HWY_NATIVE_LOWER_HALF_OF_MASK #endif template HWY_API Mask LowerHalfOfMask(D d, Mask> m) { const Twice dt; return MaskFromVec(LowerHalf(d, VecFromMask(dt, m))); } #endif // HWY_NATIVE_LOWER_HALF_OF_MASK // ------------------------------ UpperHalfOfMask #if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK #undef HWY_NATIVE_UPPER_HALF_OF_MASK #else #define HWY_NATIVE_UPPER_HALF_OF_MASK #endif #if HWY_TARGET != HWY_SCALAR || HWY_IDE template HWY_API Mask UpperHalfOfMask(D d, Mask> m) { const Twice dt; return MaskFromVec(UpperHalf(d, VecFromMask(dt, m))); } #endif #endif // HWY_NATIVE_UPPER_HALF_OF_MASK // ------------------------------ OrderedDemote2MasksTo #if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #else #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #endif #if HWY_TARGET != HWY_SCALAR || HWY_IDE template HWY_API Mask OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask a, Mask b) { static_assert( sizeof(TFromD) == sizeof(TFromD) / 2, "sizeof(TFromD) must be equal to sizeof(TFromD) / 2"); static_assert(IsSame, 
Mask, DFrom>>>(), "Mask must be the same type as " "Mask, DFrom>>>()"); const RebindToSigned di_from; const RebindToSigned di_to; const auto va = BitCast(di_from, VecFromMask(d_from, a)); const auto vb = BitCast(di_from, VecFromMask(d_from, b)); return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb))); } #endif #endif // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO // ------------------------------ RotateLeft template HWY_API V RotateLeft(V v) { constexpr size_t kSizeInBits = sizeof(TFromV) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); constexpr int kRotateRightAmt = (kBits == 0) ? 0 : static_cast(kSizeInBits) - kBits; return RotateRight(v); } // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTERLEAVE_WHOLE #undef HWY_NATIVE_INTERLEAVE_WHOLE #else #define HWY_NATIVE_INTERLEAVE_WHOLE #endif #if HWY_TARGET != HWY_SCALAR || HWY_IDE template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) { // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if // D().MaxBytes() <= 16 is true return InterleaveLower(d, a, b); } template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) { // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if // D().MaxBytes() <= 16 is true return InterleaveUpper(d, a, b); } // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3 // is implemented in x86_256-inl.h. // InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is // implemented in x86_512-inl.h. // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256 // is implemented in wasm_256-inl.h. #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_INTERLEAVE_WHOLE #if HWY_TARGET != HWY_SCALAR || HWY_IDE // The InterleaveWholeLower without the optional D parameter is generic for all // vector lengths. 
template HWY_API V InterleaveWholeLower(V a, V b) { return InterleaveWholeLower(DFromV(), a, b); } #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ InterleaveEven #if HWY_TARGET != HWY_SCALAR || HWY_IDE // InterleaveEven without the optional D parameter is generic for all vector // lengths template HWY_API V InterleaveEven(V a, V b) { return InterleaveEven(DFromV(), a, b); } #endif // ------------------------------ MinNumber/MaxNumber #if (defined(HWY_NATIVE_FLOAT_MIN_MAX_NUMBER) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER #undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER #else #define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER #endif template HWY_API V MinNumber(V a, V b) { return Min(a, b); } template HWY_API V MaxNumber(V a, V b) { return Max(a, b); } #endif template HWY_API V MinNumber(V a, V b) { return Min(a, b); } template HWY_API V MaxNumber(V a, V b) { return Max(a, b); } // ------------------------------ MinMagnitude/MaxMagnitude #if (defined(HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE #undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE #else #define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE #endif template HWY_API V MinMagnitude(V a, V b) { const V abs_a = Abs(a); const V abs_b = Abs(b); const V min = Min(IfThenElse(Eq(abs_a, abs_b), a, b), b); return IfThenElse(Lt(abs_a, abs_b), a, min); } template HWY_API V MaxMagnitude(V a, V b) { const V abs_a = Abs(a); const V abs_b = Abs(b); // This lvalue appears to be necessary to avoid a clang bug on SVE. 
const V max = Max(IfThenElse(Eq(abs_a, abs_b), b, a), a); return IfThenElse(Lt(abs_a, abs_b), b, max); } #endif // HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE template HWY_API V MinMagnitude(V a, V b) { const DFromV d; const RebindToUnsigned du; const auto abs_a = BitCast(du, Abs(a)); const auto abs_b = BitCast(du, Abs(b)); return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), a, Min(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), a, b), b)); } template HWY_API V MaxMagnitude(V a, V b) { const DFromV d; const RebindToUnsigned du; const auto abs_a = BitCast(du, Abs(a)); const auto abs_b = BitCast(du, Abs(b)); return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), b, Max(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), b, a), a)); } template HWY_API V MinMagnitude(V a, V b) { return Min(a, b); } template HWY_API V MaxMagnitude(V a, V b) { return Max(a, b); } // ------------------------------ AddSub template , 1)> HWY_API V AddSub(V a, V b) { // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b) return Sub(a, b); } // AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on // SSSE3/SSE4/AVX2/AVX3 // AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on // AVX2/AVX3 // AddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h // AddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h template HWY_API V AddSub(V a, V b) { using D = DFromV; using T = TFromD; using TNegate = If(), MakeSigned, T>; const D d; const Rebind d_negate; // Negate the even lanes of b const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b)))); return Add(a, negated_even_b); } // ------------------------------ MaskedAddOr etc. 
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_ARITH #undef HWY_NATIVE_MASKED_ARITH #else #define HWY_NATIVE_MASKED_ARITH #endif template HWY_API V MaskedMinOr(V no, M m, V a, V b) { return IfThenElse(m, Min(a, b), no); } template HWY_API V MaskedMaxOr(V no, M m, V a, V b) { return IfThenElse(m, Max(a, b), no); } template HWY_API V MaskedAddOr(V no, M m, V a, V b) { return IfThenElse(m, Add(a, b), no); } template HWY_API V MaskedSubOr(V no, M m, V a, V b) { return IfThenElse(m, Sub(a, b), no); } template HWY_API V MaskedMulOr(V no, M m, V a, V b) { return IfThenElse(m, Mul(a, b), no); } template HWY_API V MaskedDivOr(V no, M m, V a, V b) { const DFromV d; // Avoid division by zero for masked-out lanes. const V nonzero = Set(d, TFromD{1}); return IfThenElse(m, Div(a, IfThenElse(m, b, nonzero)), no); } template HWY_API V MaskedModOr(V no, M m, V a, V b) { const DFromV d; // Avoid division by zero for masked-out lanes. const V nonzero = Set(d, TFromD{1}); return IfThenElse(m, Mod(a, IfThenElse(m, b, nonzero)), no); } template HWY_API V MaskedSatAddOr(V no, M m, V a, V b) { return IfThenElse(m, SaturatedAdd(a, b), no); } template HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { return IfThenElse(m, SaturatedSub(a, b), no); } #endif // HWY_NATIVE_MASKED_ARITH #if (defined(HWY_NATIVE_ZERO_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ZERO_MASKED_ARITH #undef HWY_NATIVE_ZERO_MASKED_ARITH #else #define HWY_NATIVE_ZERO_MASKED_ARITH #endif template HWY_API V MaskedMax(M m, V a, V b) { return IfThenElseZero(m, (Max(a, b))); } template HWY_API V MaskedAdd(M m, V a, V b) { return IfThenElseZero(m, Add(a, b)); } template HWY_API V MaskedSub(M m, V a, V b) { return IfThenElseZero(m, Sub(a, b)); } template HWY_API V MaskedMul(M m, V a, V b) { return IfThenElseZero(m, Mul(a, b)); } template HWY_API V MaskedDiv(M m, V a, V b) { return IfThenElseZero(m, Div(a, b)); } template HWY_API V MaskedSaturatedAdd(M m, V a, V b) { 
return IfThenElseZero(m, SaturatedAdd(a, b)); } template HWY_API V MaskedSaturatedSub(M m, V a, V b) { return IfThenElseZero(m, SaturatedSub(a, b)); } template , HWY_IF_I16_D(D)> HWY_API V MaskedMulFixedPoint15(M m, V a, V b) { return IfThenElseZero(m, MulFixedPoint15(a, b)); } template HWY_API V MaskedMulAdd(M m, V mul, V x, V add) { return IfThenElseZero(m, MulAdd(mul, x, add)); } template HWY_API V MaskedNegMulAdd(M m, V mul, V x, V add) { return IfThenElseZero(m, NegMulAdd(mul, x, add)); } template >> HWY_API VFromD MaskedWidenMulPairwiseAdd(D d32, M m, V16 a, V16 b) { return IfThenElseZero(m, WidenMulPairwiseAdd(d32, a, b)); } template HWY_API VFromD MaskedWidenMulPairwiseAdd(DF df, M m, VBF a, VBF b) { return IfThenElseZero(m, WidenMulPairwiseAdd(df, a, b)); } #endif // HWY_NATIVE_ZERO_MASKED_ARITH // ------------------------------ MaskedShift template HWY_API V MaskedShiftLeft(M m, V a) { return IfThenElseZero(m, ShiftLeft(a)); } template HWY_API V MaskedShiftRight(M m, V a) { return IfThenElseZero(m, ShiftRight(a)); } template HWY_API V MaskedShiftRightOr(V no, M m, V a) { return IfThenElse(m, ShiftRight(a), no); } template HWY_API V MaskedShrOr(V no, M m, V a, V shifts) { return IfThenElse(m, Shr(a, shifts), no); } // ------------------------------ MaskedEq etc. 
#if (defined(HWY_NATIVE_MASKED_COMP) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_COMP #undef HWY_NATIVE_MASKED_COMP #else #define HWY_NATIVE_MASKED_COMP #endif template HWY_API auto MaskedEq(M m, V a, V b) -> decltype(a == b) { return And(m, Eq(a, b)); } template HWY_API auto MaskedNe(M m, V a, V b) -> decltype(a == b) { return And(m, Ne(a, b)); } template HWY_API auto MaskedLt(M m, V a, V b) -> decltype(a == b) { return And(m, Lt(a, b)); } template HWY_API auto MaskedGt(M m, V a, V b) -> decltype(a == b) { return And(m, Gt(a, b)); } template HWY_API auto MaskedLe(M m, V a, V b) -> decltype(a == b) { return And(m, Le(a, b)); } template HWY_API auto MaskedGe(M m, V a, V b) -> decltype(a == b) { return And(m, Ge(a, b)); } template > HWY_API MFromD MaskedIsNaN(const M m, const V v) { return And(m, IsNaN(v)); } #endif // HWY_NATIVE_MASKED_COMP // ------------------------------ IfNegativeThenNegOrUndefIfZero #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #else #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #endif template HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { #if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE const auto zero = Zero(DFromV()); return MaskedSubOr(v, Lt(mask, zero), zero, v); #else return IfNegativeThenElse(mask, Neg(v), v); #endif } #endif // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG template HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { return CopySign(v, Xor(mask, v)); } // ------------------------------ SaturatedNeg #if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32 #undef HWY_NATIVE_SATURATED_NEG_8_16_32 #else #define HWY_NATIVE_SATURATED_NEG_8_16_32 #endif template HWY_API V SaturatedNeg(V v) { const DFromV d; return SaturatedSub(Zero(d), v); } template )> 
HWY_API V SaturatedNeg(V v) { const DFromV d; #if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_PPC || HWY_TARGET_IS_SVE || \ HWY_TARGET_IS_NEON // RVV/PPC/SVE/NEON have native I32 SaturatedSub instructions return SaturatedSub(Zero(d), v); #else // ~v[i] - ((v[i] > LimitsMin()) ? -1 : 0) is equivalent to // (v[i] > LimitsMin) ? (-v[i]) : LimitsMax() since // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and // ~LimitsMin() == LimitsMax(). return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin())))); #endif } #endif // HWY_NATIVE_SATURATED_NEG_8_16_32 #if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_NEG_64 #undef HWY_NATIVE_SATURATED_NEG_64 #else #define HWY_NATIVE_SATURATED_NEG_64 #endif template )> HWY_API V SaturatedNeg(V v) { #if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_SVE || HWY_TARGET_IS_NEON // RVV/SVE/NEON have native I64 SaturatedSub instructions const DFromV d; return SaturatedSub(Zero(d), v); #else const auto neg_v = Neg(v); return Add(neg_v, BroadcastSignBit(And(v, neg_v))); #endif } #endif // HWY_NATIVE_SATURATED_NEG_64 // ------------------------------ SaturatedAbs #if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_ABS #undef HWY_NATIVE_SATURATED_ABS #else #define HWY_NATIVE_SATURATED_ABS #endif template HWY_API V SaturatedAbs(V v) { return Max(v, SaturatedNeg(v)); } #endif // ------------------------------ MaskedAbsOr template HWY_API V MaskedAbsOr(V no, M m, V v) { return IfThenElse(m, Abs(v), no); } // ------------------------------ MaskedAbs template HWY_API V MaskedAbs(M m, V v) { return IfThenElseZero(m, Abs(v)); } // ------------------------------ Reductions // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled, // they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set. // Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the // SumOfLanes overloads. 
For the latter group, we here define the remaining // overloads, plus ReduceSum which uses them plus GetLane. #if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_SCALAR #undef HWY_NATIVE_REDUCE_SCALAR #else #define HWY_NATIVE_REDUCE_SCALAR #endif namespace detail { // Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes. struct AddFunc { template V operator()(V a, V b) const { return Add(a, b); } }; struct MinFunc { template V operator()(V a, V b) const { return Min(a, b); } }; struct MaxFunc { template V operator()(V a, V b) const { return Max(a, b); } }; // No-op for vectors of at most one block. template HWY_INLINE VFromD ReduceAcrossBlocks(D, Func, VFromD v) { return v; } // Reduces a lane with its counterpart in other block(s). Shared by AVX2 and // WASM_EMU256. AVX3 has its own overload. template HWY_INLINE VFromD ReduceAcrossBlocks(D /*d*/, Func f, VFromD v) { return f(v, SwapAdjacentBlocks(v)); } // These return the reduction result broadcasted across all lanes. They assume // the caller has already reduced across blocks. template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v10) { return f(v10, Reverse2(d, v10)); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v3210) { const VFromD v0123 = Reverse4(d, v3210); const VFromD v03_12_12_03 = f(v3210, v0123); const VFromD v12_03_03_12 = Reverse2(d, v03_12_12_03); return f(v03_12_12_03, v12_03_03_12); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v76543210) { // The upper half is reversed from the lower half; omit for brevity. 
const VFromD v34_25_16_07 = f(v76543210, Reverse8(d, v76543210)); const VFromD v0347_1625_1625_0347 = f(v34_25_16_07, Reverse4(d, v34_25_16_07)); return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v) { const RepartitionToWide dw; using VW = VFromD; const VW vw = BitCast(dw, v); // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN. const VW even = And(vw, Set(dw, 0xFF)); const VW odd = ShiftRight<8>(vw); const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd)); #if HWY_IS_LITTLE_ENDIAN return DupEven(BitCast(d, reduced)); #else return DupOdd(BitCast(d, reduced)); #endif } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v) { const RepartitionToWide dw; using VW = VFromD; const VW vw = BitCast(dw, v); // Sign-extend // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN. const VW even = ShiftRight<8>(ShiftLeft<8>(vw)); const VW odd = ShiftRight<8>(vw); const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd)); #if HWY_IS_LITTLE_ENDIAN return DupEven(BitCast(d, reduced)); #else return DupOdd(BitCast(d, reduced)); #endif } } // namespace detail template HWY_API VFromD SumOfLanes(D d, VFromD v) { const detail::AddFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API VFromD MinOfLanes(D d, VFromD v) { const detail::MinFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API VFromD MaxOfLanes(D d, VFromD v) { const detail::MaxFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API TFromD ReduceSum(D d, VFromD v) { return GetLane(SumOfLanes(d, v)); } template HWY_API TFromD ReduceMin(D d, VFromD v) { return GetLane(MinOfLanes(d, v)); } template HWY_API TFromD ReduceMax(D d, VFromD v) { return GetLane(MaxOfLanes(d, v)); } #endif // HWY_NATIVE_REDUCE_SCALAR // Corner cases for both 
generic and native implementations: // N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm) template HWY_API TFromD ReduceSum(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API TFromD ReduceMin(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API TFromD ReduceMax(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API VFromD SumOfLanes(D /* tag */, VFromD v) { return v; } template HWY_API VFromD MinOfLanes(D /* tag */, VFromD v) { return v; } template HWY_API VFromD MaxOfLanes(D /* tag */, VFromD v) { return v; } // N=4 for 8-bit is still less than the minimum native size. // ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8 // ReduceSum operations #if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8 #undef HWY_NATIVE_REDUCE_SUM_4_UI8 #else #define HWY_NATIVE_REDUCE_SUM_4_UI8 #endif template HWY_API TFromD ReduceSum(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceSum(dw, PromoteTo(dw, v))); } #endif // HWY_NATIVE_REDUCE_SUM_4_UI8 // RVV/SVE have target-specific implementations of the N=4 I8/U8 // ReduceMin/ReduceMax operations #if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8 #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8 #else #define HWY_NATIVE_REDUCE_MINMAX_4_UI8 #endif template HWY_API TFromD ReduceMin(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceMin(dw, PromoteTo(dw, v))); } template HWY_API TFromD ReduceMax(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceMax(dw, PromoteTo(dw, v))); } #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8 #if (defined(HWY_NATIVE_MASKED_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR #undef HWY_NATIVE_MASKED_REDUCE_SCALAR #else #define HWY_NATIVE_MASKED_REDUCE_SCALAR #endif template HWY_API TFromD MaskedReduceSum(D d, M m, VFromD v) { return ReduceSum(d, IfThenElseZero(m, v)); } template HWY_API TFromD 
// NOTE(review): this copy of the file has lost its line breaks and every
// template parameter list / angle-bracket argument (e.g. `template HWY_API`,
// `Twice> dw`); restore them from the pristine file before compiling.
//
// Summary of the reduction helpers around this point:
// - Single-lane overloads (the "N=1" comment): ReduceSum/ReduceMin/ReduceMax
//   return GetLane(v); SumOfLanes/MinOfLanes/MaxOfLanes return v unchanged.
// - HWY_NATIVE_REDUCE_SUM_4_UI8 and HWY_NATIVE_REDUCE_MINMAX_4_UI8 sections:
//   ReduceSum/ReduceMin/ReduceMax promote v to the tag `dw` with PromoteTo,
//   reduce there, and static_cast the scalar result back.
// - HWY_NATIVE_MASKED_REDUCE_SCALAR section: MaskedReduceSum reduces
//   IfThenElseZero(m, v). MaskedReduceMin / MaskedReduceMax replace the lanes
//   not selected by m with PositiveInfOrHighestValue / NegativeInfOrLowestValue
//   before calling ReduceMin / ReduceMax (presumably so that unselected lanes
//   cannot change the result - confirm against those helpers).
// Each section is guarded by `defined(X) == defined(HWY_TARGET_TOGGLE)` and
// then flips X via #undef/#define.
MaskedReduceMin(D d, M m, VFromD v) { return ReduceMin( d, IfThenElse(m, v, Set(d, hwy::PositiveInfOrHighestValue>()))); } template HWY_API TFromD MaskedReduceMax(D d, M m, VFromD v) { return ReduceMax( d, IfThenElse(m, v, Set(d, hwy::NegativeInfOrLowestValue>()))); } #endif // HWY_NATIVE_MASKED_REDUCE_SCALAR // ------------------------------ IsEitherNaN #if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_IS_EITHER_NAN #undef HWY_NATIVE_IS_EITHER_NAN #else #define HWY_NATIVE_IS_EITHER_NAN #endif template HWY_API MFromD> IsEitherNaN(V a, V b) { return Or(IsNaN(a), IsNaN(b)); } #endif // HWY_NATIVE_IS_EITHER_NAN // ------------------------------ IsInf, IsFinite // AVX3 has target-specific implementations of these. #if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ISINF #undef HWY_NATIVE_ISINF #else #define HWY_NATIVE_ISINF #endif template > HWY_API MFromD IsInf(const V v) { using T = TFromD; const D d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask( d, Eq(Add(vu, vu), Set(du, static_cast>(hwy::MaxExponentTimes2())))); } // Returns whether normal/subnormal/zero. template > HWY_API MFromD IsFinite(const V v) { using T = TFromD; const D d; const RebindToUnsigned du; const RebindToSigned di; // cheaper than unsigned comparison const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code // for AVX2 if we instead add vu + vu. #if HWY_COMPILER_MSVC const VFromD shl = ShiftLeft<1>(vu); #else const VFromD shl = Add(vu, vu); #endif // Then shift right so we can compare with the max exponent (cannot compare // with MaxExponentTimes2 directly because it is negative and non-negative // floats would be greater). 
// NOTE(review): summary of the floating-point classification helpers around
// this point:
// - IsEitherNaN(a, b) is Or(IsNaN(a), IsNaN(b)).
// - IsInf bit-casts v to the unsigned tag, doubles it with Add(vu, vu) to drop
//   the sign bit, and tests equality with MaxExponentTimes2; the result mask is
//   rebound to d via RebindMask.
// - IsFinite drops the sign bit the same way (ShiftLeft<1> when
//   HWY_COMPILER_MSVC is set), shifts right again, and does a signed Lt
//   against MaxExponentField. The right-shift count's template argument is
//   only partly visible in this copy (`ShiftRight() + 1>`).
const VFromD exp = BitCast(di, ShiftRight() + 1>(shl)); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); } #endif // HWY_NATIVE_ISINF // ------------------------------ CeilInt/FloorInt #if (defined(HWY_NATIVE_CEIL_FLOOR_INT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_CEIL_FLOOR_INT #undef HWY_NATIVE_CEIL_FLOOR_INT #else #define HWY_NATIVE_CEIL_FLOOR_INT #endif template HWY_API VFromD>> CeilInt(V v) { const DFromV d; const RebindToSigned di; return ConvertTo(di, Ceil(v)); } template HWY_API VFromD>> FloorInt(V v) { const DFromV d; const RebindToSigned di; return ConvertTo(di, Floor(v)); } #endif // HWY_NATIVE_CEIL_FLOOR_INT // ------------------------------ MulByPow2/MulByFloorPow2 #if (defined(HWY_NATIVE_MUL_BY_POW2) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MUL_BY_POW2 #undef HWY_NATIVE_MUL_BY_POW2 #else #define HWY_NATIVE_MUL_BY_POW2 #endif template HWY_API V MulByPow2(V v, VFromD>> exp) { const DFromV df; const RebindToUnsigned du; const RebindToSigned di; using TF = TFromD; using TI = TFromD; using TU = TFromD; using VF = VFromD; using VI = VFromD; constexpr TI kMaxBiasedExp = MaxExponentField(); static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true"); constexpr TI kExpBias = static_cast(kMaxBiasedExp >> 1); static_assert(kExpBias > 0, "kExpBias > 0 must be true"); static_assert(kExpBias <= LimitsMax() / 3, "kExpBias <= LimitsMax() / 3 must be true"); #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 using TExpMinMax = If<(sizeof(TI) <= 4), TI, int32_t>; #elif (HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2) || \ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256 using TExpMinMax = int16_t; #else using TExpMinMax = TI; #endif #if HWY_TARGET == HWY_EMU128 || HWY_TARGET == HWY_SCALAR using TExpSatSub = TU; #elif HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \ HWY_TARGET == HWY_WASM_EMU256 using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, uint16_t>; #elif HWY_TARGET_IS_PPC using TExpSatSub = If<(sizeof(TF) >= 
// NOTE(review): summary of the code around this point:
// - CeilInt / FloorInt are ConvertTo(di, Ceil(v)) / ConvertTo(di, Floor(v)),
//   where di is the signed-integer rebinding of v's tag.
// - MulByPow2(v, exp) setup: kMaxBiasedExp comes from MaxExponentField and
//   kExpBias is kMaxBiasedExp >> 1. The static_asserts require kExpBias * 3 to
//   fit, because the body later clamps exp to kExpBias * 3.
// - TExpMinMax (lane type for the Min/Max steps) and TExpSatSub (lane type for
//   the SaturatedSub step) are selected per target by the HWY_TARGET
//   preprocessor branches; the vector is re-viewed at those lane types through
//   the Repartition tags d_exp_min_max and d_sat_exp_sub.
4), uint32_t, TU>; #else using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, TU>; #endif static_assert(kExpBias <= static_cast(LimitsMax() / 3), "kExpBias <= LimitsMax() / 3 must be true"); const Repartition d_exp_min_max; const Repartition d_sat_exp_sub; constexpr int kNumOfExpBits = ExponentBits(); constexpr int kNumOfMantBits = MantissaBits(); // The sign bit of BitCastScalar(a[i]) >> kNumOfMantBits can be zeroed out // using SaturatedSub if kZeroOutSignUsingSatSub is true. // If kZeroOutSignUsingSatSub is true, then val_for_exp_sub will be bitcasted // to a vector that has a smaller lane size than TU for the SaturatedSub // operation below. constexpr bool kZeroOutSignUsingSatSub = ((sizeof(TExpSatSub) * 8) == static_cast(kNumOfExpBits)); // If kZeroOutSignUsingSatSub is true, then the upper // (sizeof(TU) - sizeof(TExpSatSub)) * 8 bits of kExpDecrBy1Bits will be all // ones and the lower sizeof(TExpSatSub) * 8 bits of kExpDecrBy1Bits will be // equal to 1. // Otherwise, if kZeroOutSignUsingSatSub is false, kExpDecrBy1Bits will be // equal to 1. 
// NOTE(review): remainder of MulByPow2, in order:
// 1. If the SaturatedSub trick is not available (!kZeroOutSignUsingSatSub),
//    the sign is cleared up front with Abs.
// 2. min_exp1_plus_min_exp2 is derived from the shifted-down bit pattern of v
//    via SaturatedSub and Neg, and bounded below (Max) by 2 - 2 * kExpBias.
// 3. clamped_exp limits exp to at most kExpBias * 3 and to at least
//    min_exp1_plus_min_exp2 + (1 - kExpBias).
// 4. clamped_exp is split into exp1, exp2 and exp3: exp1 + exp2 equals
//    exp1_plus_exp2, and exp3 is clamped_exp - exp1_plus_exp2, so the three
//    sum to clamped_exp.
// 5. Each factorN is built by adding exp_bias to expN, shifting left and
//    bit-casting to the float tag; the shift count's template argument is not
//    visible in this copy (presumably kNumOfMantBits, making each factor a
//    power of two - confirm). The result is v * factor1 * factor2 * factor3.
constexpr TU kExpDecrBy1Bits = static_cast( TU{1} - (static_cast(kZeroOutSignUsingSatSub) << kNumOfExpBits)); VF val_for_exp_sub = v; HWY_IF_CONSTEXPR(!kZeroOutSignUsingSatSub) { // If kZeroOutSignUsingSatSub is not true, zero out the sign bit of // val_for_exp_sub[i] using Abs val_for_exp_sub = Abs(val_for_exp_sub); } // min_exp1_plus_min_exp2[i] is the smallest exponent such that // min_exp1_plus_min_exp2[i] >= 2 - kExpBias * 2 and // std::ldexp(v[i], min_exp1_plus_min_exp2[i]) is a normal floating-point // number if v[i] is a normal number const VI min_exp1_plus_min_exp2 = BitCast( di, Max(BitCast( d_exp_min_max, Neg(BitCast( di, SaturatedSub( BitCast(d_sat_exp_sub, ShiftRight( BitCast(du, val_for_exp_sub))), BitCast(d_sat_exp_sub, Set(du, kExpDecrBy1Bits)))))), BitCast(d_exp_min_max, Set(di, static_cast(2 - kExpBias - kExpBias))))); const VI clamped_exp = Max(Min(exp, Set(di, static_cast(kExpBias * 3))), Add(min_exp1_plus_min_exp2, Set(di, static_cast(1 - kExpBias)))); const VI exp1_plus_exp2 = BitCast( di, Max(Min(BitCast(d_exp_min_max, Sub(clamped_exp, ShiftRight<2>(clamped_exp))), BitCast(d_exp_min_max, Set(di, static_cast(kExpBias + kExpBias)))), BitCast(d_exp_min_max, min_exp1_plus_min_exp2))); const VI exp1 = ShiftRight<1>(exp1_plus_exp2); const VI exp2 = Sub(exp1_plus_exp2, exp1); const VI exp3 = Sub(clamped_exp, exp1_plus_exp2); const VI exp_bias = Set(di, kExpBias); const VF factor1 = BitCast(df, ShiftLeft(Add(exp1, exp_bias))); const VF factor2 = BitCast(df, ShiftLeft(Add(exp2, exp_bias))); const VF factor3 = BitCast(df, ShiftLeft(Add(exp3, exp_bias))); return Mul(Mul(Mul(v, factor1), factor2), factor3); } template HWY_API V MulByFloorPow2(V v, V exp) { const DFromV df; // MulByFloorPow2 special cases: // MulByFloorPow2(v, NaN) => NaN // MulByFloorPow2(0, inf) => NaN // MulByFloorPow2(inf, -inf) => NaN // MulByFloorPow2(-inf, -inf) => NaN const auto is_special_case_with_nan_result = Or(IsNaN(exp), And(Eq(Abs(v), IfNegativeThenElseZero(exp, Inf(df))), 
// NOTE(review): summary of the code around this point:
// - MulByFloorPow2(v, exp) returns NaN(df) when exp is NaN, or when exp is
//   infinite and Abs(v) equals IfNegativeThenElseZero(exp, Inf(df)) (that is,
//   |v| == inf for negative exp, |v| == 0 otherwise). All other lanes get
//   MulByPow2(v, FloorInt(exp)).
// - GetBiasedExponent(v) right-shifts the unsigned bit pattern of Abs(v); the
//   shift count's template argument is not visible in this copy (presumably
//   kNumOfMantBits, which is declared just before - confirm).
// - GetExponent(v) subtracts MaxExponentField >> 1 from the biased exponent in
//   the signed-integer domain and converts back to v's lane type with
//   ConvertTo(d, ...).
IsInf(exp))); return IfThenElse(is_special_case_with_nan_result, NaN(df), MulByPow2(v, FloorInt(exp))); } #endif // HWY_NATIVE_MUL_BY_POW2 // ------------------------------ GetBiasedExponent #if (defined(HWY_NATIVE_GET_BIASED_EXPONENT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_GET_BIASED_EXPONENT #undef HWY_NATIVE_GET_BIASED_EXPONENT #else #define HWY_NATIVE_GET_BIASED_EXPONENT #endif template HWY_API VFromD>> GetBiasedExponent(V v) { using T = TFromV; const DFromV d; const RebindToUnsigned du; constexpr int kNumOfMantBits = MantissaBits(); return ShiftRight(BitCast(du, Abs(v))); } #endif // ------------------------------ GetExponent #if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_GET_EXPONENT #undef HWY_NATIVE_GET_EXPONENT #else #define HWY_NATIVE_GET_EXPONENT #endif template HWY_API V GetExponent(V v) { const DFromV d; using T = TFromV; const RebindToSigned di; const auto exponent_offset = Set(di, MaxExponentField() >> 1); // extract exponent bits as integer const auto encoded_exponent = GetBiasedExponent(v); const auto exponent_int = Sub(BitCast(di, encoded_exponent), exponent_offset); // convert integer to original type return ConvertTo(d, exponent_int); } #endif // HWY_NATIVE_GET_EXPONENT // ------------------------------ LoadInterleaved2 #if HWY_IDE || \ (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED #else #define HWY_NATIVE_LOAD_STORE_INTERLEAVED #endif template