// Copyright 2019 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Single-element vectors and operations. // External include guard in highway.h - see comment there. #include #ifndef HWY_NO_LIBCXX #include // sqrtf #endif #include "hwy/ops/shared-inl.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { // Single instruction, single data. template using Sisd = Simd; // (Wrapper class required for overloading comparison operators.) template struct Vec1 { using PrivateT = T; // only for DFromV static constexpr size_t kPrivateN = 1; // only for DFromV HWY_INLINE Vec1() = default; Vec1(const Vec1&) = default; Vec1& operator=(const Vec1&) = default; HWY_INLINE explicit Vec1(const T t) : raw(t) {} HWY_INLINE Vec1& operator*=(const Vec1 other) { return *this = (*this * other); } HWY_INLINE Vec1& operator/=(const Vec1 other) { return *this = (*this / other); } HWY_INLINE Vec1& operator+=(const Vec1 other) { return *this = (*this + other); } HWY_INLINE Vec1& operator-=(const Vec1 other) { return *this = (*this - other); } HWY_INLINE Vec1& operator%=(const Vec1 other) { return *this = (*this % other); } HWY_INLINE Vec1& operator&=(const Vec1 other) { return *this = (*this & other); } HWY_INLINE Vec1& operator|=(const Vec1 other) { return *this = (*this | other); } HWY_INLINE Vec1& operator^=(const Vec1 other) { return *this = (*this ^ other); } T raw; }; // 0 or FF..FF, same size as Vec1. template struct Mask1 { using Raw = hwy::MakeUnsigned; using PrivateT = T; // only for DFromM static constexpr size_t kPrivateN = 1; // only for DFromM static HWY_INLINE Mask1 FromBool(bool b) { Mask1 mask; mask.bits = b ? static_cast(~Raw{0}) : 0; return mask; } Raw bits; }; template using DFromV = Simd; template using DFromM = Simd; template using TFromV = typename V::PrivateT; // ------------------------------ BitCast template , typename TFrom> HWY_API Vec1 BitCast(DTo /* tag */, Vec1 v) { static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined"); TTo to; CopyBytes(&v.raw, &to); // not same size - ok to shrink return Vec1(to); } // ------------------------------ Zero template > HWY_API Vec1 Zero(D /* tag */) { return Vec1(ConvertScalarTo(0)); } template using VFromD = decltype(Zero(D())); // ------------------------------ Set template , typename T2> HWY_API Vec1 Set(D /* tag */, const T2 t) { return Vec1(static_cast(t)); } // ------------------------------ Undefined template > HWY_API Vec1 Undefined(D d) { return Zero(d); } // ------------------------------ Iota template , typename T2> HWY_API Vec1 Iota(const D /* tag */, const T2 first) { return Vec1(static_cast(first)); } // ------------------------------ ResizeBitCast template HWY_API VFromD ResizeBitCast(D /* tag */, FromV v) { using TFrom = TFromV; using TTo = TFromD; constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo)); TTo to{}; CopyBytes(&v.raw, &to); return VFromD(to); } namespace detail { // ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if // sizeof(TFromD) is greater than sizeof(TFromV) template HWY_INLINE VFromD ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */, ToSizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { return ResizeBitCast(d_to, v); } } // namespace detail // ------------------------------ Dup128VecFromValues template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD /*t1*/, TFromD /*t2*/, TFromD /*t3*/, TFromD /*t4*/, TFromD /*t5*/, TFromD /*t6*/, TFromD /*t7*/, TFromD /*t8*/, TFromD /*t9*/, TFromD /*t10*/, TFromD /*t11*/, TFromD /*t12*/, TFromD /*t13*/, TFromD /*t14*/, TFromD /*t15*/) { return VFromD(t0); } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD /*t1*/, TFromD /*t2*/, TFromD /*t3*/, TFromD /*t4*/, TFromD /*t5*/, TFromD /*t6*/, TFromD /*t7*/) { return VFromD(t0); } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD /*t1*/, TFromD /*t2*/, TFromD /*t3*/) { return VFromD(t0); } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD /*t1*/) { return VFromD(t0); } // ================================================== LOGICAL // ------------------------------ Not template HWY_API Vec1 Not(const Vec1 v) { using TU = MakeUnsigned; const Sisd du; return BitCast(Sisd(), Vec1(static_cast(~BitCast(du, v).raw))); } // ------------------------------ And template HWY_API Vec1 And(const Vec1 a, const Vec1 b) { using TU = MakeUnsigned; const Sisd du; return BitCast(Sisd(), Vec1(BitCast(du, a).raw & BitCast(du, b).raw)); } template HWY_API Vec1 operator&(const Vec1 a, const Vec1 b) { return And(a, b); } // ------------------------------ AndNot template HWY_API Vec1 AndNot(const Vec1 a, const Vec1 b) { using TU = MakeUnsigned; const Sisd du; return BitCast(Sisd(), Vec1(static_cast(~BitCast(du, a).raw & BitCast(du, b).raw))); } // ------------------------------ Or template HWY_API Vec1 Or(const Vec1 a, const Vec1 b) { using TU = MakeUnsigned; const Sisd du; return BitCast(Sisd(), Vec1(BitCast(du, a).raw | BitCast(du, b).raw)); } template HWY_API Vec1 operator|(const Vec1 a, const Vec1 b) { return Or(a, b); } // ------------------------------ Xor template HWY_API Vec1 Xor(const Vec1 a, const Vec1 b) { using TU = MakeUnsigned; const Sisd du; return BitCast(Sisd(), Vec1(BitCast(du, a).raw ^ BitCast(du, b).raw)); } template HWY_API Vec1 operator^(const Vec1 a, const Vec1 b) { return Xor(a, b); } // ------------------------------ Xor3 template HWY_API Vec1 Xor3(Vec1 x1, Vec1 x2, Vec1 x3) { return Xor(x1, Xor(x2, x3)); } // ------------------------------ Or3 template HWY_API Vec1 Or3(Vec1 o1, Vec1 o2, Vec1 o3) { return Or(o1, Or(o2, o3)); } // ------------------------------ OrAnd template HWY_API Vec1 OrAnd(const Vec1 o, const Vec1 a1, const Vec1 a2) { return Or(o, And(a1, a2)); } // ------------------------------ Mask template , typename TFrom> HWY_API Mask1 RebindMask(DTo /*tag*/, Mask1 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); return Mask1{m.bits}; } // v must be 0 or FF..FF. template HWY_API Mask1 MaskFromVec(const Vec1 v) { Mask1 mask; CopySameSize(&v, &mask); return mask; } template using MFromD = decltype(MaskFromVec(VFromD())); template > Vec1 VecFromMask(D /* tag */, const Mask1 mask) { Vec1 v; CopySameSize(&mask, &v); return v; } template uint64_t BitsFromMask(D, MFromD mask) { return mask.bits ? 1 : 0; } template > HWY_API Mask1 FirstN(D /*tag*/, size_t n) { return Mask1::FromBool(n != 0); } // ------------------------------ IfVecThenElse template HWY_API Vec1 IfVecThenElse(Vec1 mask, Vec1 yes, Vec1 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ CopySign template HWY_API Vec1 CopySign(const Vec1 magn, const Vec1 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const DFromV d; return BitwiseIfThenElse(SignBit(d), sign, magn); } // ------------------------------ CopySignToAbs template HWY_API Vec1 CopySignToAbs(const Vec1 abs, const Vec1 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const Sisd d; return OrAnd(abs, SignBit(d), sign); } // ------------------------------ BroadcastSignBit template HWY_API Vec1 BroadcastSignBit(const Vec1 v) { return Vec1(ScalarShr(v.raw, sizeof(T) * 8 - 1)); } // ------------------------------ PopulationCount #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif template HWY_API Vec1 PopulationCount(Vec1 v) { return Vec1(static_cast(PopCount(v.raw))); } // ------------------------------ IfThenElse // Returns mask ? yes : no. template HWY_API Vec1 IfThenElse(const Mask1 mask, const Vec1 yes, const Vec1 no) { return mask.bits ? yes : no; } template HWY_API Vec1 IfThenElseZero(const Mask1 mask, const Vec1 yes) { return mask.bits ? yes : Vec1(ConvertScalarTo(0)); } template HWY_API Vec1 IfThenZeroElse(const Mask1 mask, const Vec1 no) { return mask.bits ? Vec1(ConvertScalarTo(0)) : no; } template HWY_API Vec1 IfNegativeThenElse(Vec1 v, Vec1 yes, Vec1 no) { const DFromV d; const RebindToSigned di; const auto vi = BitCast(di, v); return vi.raw < 0 ? yes : no; } // ------------------------------ Mask logical template HWY_API Mask1 Not(const Mask1 m) { return MaskFromVec(Not(VecFromMask(Sisd(), m))); } template HWY_API Mask1 And(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 AndNot(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 Or(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 Xor(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 ExclusiveNeither(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } template HWY_API Mask1 SetAtOrAfterFirst(Mask1 mask) { return mask; } template HWY_API Mask1 SetBeforeFirst(Mask1 mask) { return Not(mask); } template HWY_API Mask1 SetOnlyFirst(Mask1 mask) { return mask; } template HWY_API Mask1 SetAtOrBeforeFirst(Mask1 /*mask*/) { return Mask1::FromBool(true); } // ------------------------------ LowerHalfOfMask #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK #undef HWY_NATIVE_LOWER_HALF_OF_MASK #else #define HWY_NATIVE_LOWER_HALF_OF_MASK #endif template HWY_API MFromD LowerHalfOfMask(D /*d*/, MFromD m) { return m; } // ================================================== SHIFTS // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) template HWY_API Vec1 ShiftLeft(const Vec1 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); return Vec1( static_cast(static_cast>(v.raw) << kBits)); } template HWY_API Vec1 ShiftRight(const Vec1 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); return Vec1(ScalarShr(v.raw, kBits)); } // ------------------------------ RotateRight (ShiftRight) template HWY_API Vec1 RotateRight(const Vec1 v) { const DFromV d; const RebindToUnsigned du; constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; return Or(BitCast(d, ShiftRight(BitCast(du, v))), ShiftLeft(v)); } // ------------------------------ ShiftLeftSame (BroadcastSignBit) template HWY_API Vec1 ShiftLeftSame(const Vec1 v, int bits) { return Vec1( static_cast(static_cast>(v.raw) << bits)); } template HWY_API Vec1 ShiftRightSame(const Vec1 v, int bits) { return Vec1(ScalarShr(v.raw, bits)); } // ------------------------------ Shl // Single-lane => same as ShiftLeftSame except for the argument type. template HWY_API Vec1 operator<<(const Vec1 v, const Vec1 bits) { return ShiftLeftSame(v, static_cast(bits.raw)); } template HWY_API Vec1 operator>>(const Vec1 v, const Vec1 bits) { return ShiftRightSame(v, static_cast(bits.raw)); } // ================================================== ARITHMETIC template HWY_API Vec1 operator+(Vec1 a, Vec1 b) { const uint64_t a64 = static_cast(a.raw); const uint64_t b64 = static_cast(b.raw); return Vec1(static_cast((a64 + b64) & static_cast(~T(0)))); } HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { return Vec1(a.raw + b.raw); } HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { return Vec1(a.raw + b.raw); } template HWY_API Vec1 operator-(Vec1 a, Vec1 b) { const uint64_t a64 = static_cast(a.raw); const uint64_t b64 = static_cast(b.raw); return Vec1(static_cast((a64 - b64) & static_cast(~T(0)))); } HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { return Vec1(a.raw - b.raw); } HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { return Vec1(a.raw - b.raw); } // ------------------------------ SumsOf8 HWY_API Vec1 SumsOf8(const Vec1 v) { return Vec1(v.raw); } HWY_API Vec1 SumsOf8(const Vec1 v) { return Vec1(v.raw); } // ------------------------------ SumsOf2 template HWY_API Vec1> SumsOf2(const Vec1 v) { const DFromV d; const Rebind, decltype(d)> dw; return PromoteTo(dw, v); } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. // Unsigned HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); } HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(0, static_cast(a.raw) + b.raw), 65535))); } // Signed HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); } HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(-32768, static_cast(a.raw) + b.raw), 32767))); } // ------------------------------ Saturating subtraction // Returns a - b clamped to the destination range. // Unsigned HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); } HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(0, static_cast(a.raw) - b.raw), 65535))); } // Signed HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); } HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(-32768, static_cast(a.raw) - b.raw), 32767))); } // ------------------------------ Average // Returns (a + b + 1) / 2 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32 #undef HWY_NATIVE_AVERAGE_ROUND_UI32 #else #define HWY_NATIVE_AVERAGE_ROUND_UI32 #endif #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64 #undef HWY_NATIVE_AVERAGE_ROUND_UI64 #else #define HWY_NATIVE_AVERAGE_ROUND_UI64 #endif template HWY_API Vec1 AverageRound(const Vec1 a, const Vec1 b) { const T a_val = a.raw; const T b_val = b.raw; return Vec1