/**************************** vectorf256e.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 * Last modified: 2014-10-22 * Version: 1.16 * Project: vector classes * Description: * Header file defining 256-bit floating point vector classes as interface * to intrinsic functions. Emulated for processors without AVX instruction set. * * The following vector classes are defined here: * Vec8f Vector of 8 single precision floating point numbers * Vec8fb Vector of 8 Booleans for use with Vec8f * Vec4d Vector of 4 double precision floating point numbers * Vec4db Vector of 4 Booleans for use with Vec4d * * For detailed instructions, see VectorClass.pdf * * (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses *****************************************************************************/ // check combination of header files #ifdef VECTORF256_H #if VECTORF256_H != 1 #error Two different versions of vectorf256.h included #endif #else #define VECTORF256_H 1 #if defined (VECTORI256_H) && VECTORI256_H >= 2 #error wrong combination of header files. Use vectorf256.h instead of vectorf256e.h if you have AVX2 #endif #include "vectorf128.h" // Define 128-bit vectors /***************************************************************************** * * base class Vec256fe and Vec256de * *****************************************************************************/ // base class to replace __m256 when AVX is not supported class Vec256fe { protected: __m128 y0; // low half __m128 y1; // high half public: Vec256fe(void) {}; // default constructor Vec256fe(__m128 x0, __m128 x1) { // constructor to build from two __m128 y0 = x0; y1 = x1; } __m128 get_low() const { // get low half return y0; } __m128 get_high() const { // get high half return y1; } }; // base class to replace __m256d when AVX is not supported class Vec256de { public: Vec256de() {}; // default constructor Vec256de(__m128d x0, __m128d x1) { // constructor to build from two __m128d y0 = x0; y1 = x1; } __m128d get_low() const { // get low half return y0; } __m128d get_high() const { // get high half return y1; } protected: __m128d y0; // low half __m128d y1; // high half }; /***************************************************************************** * * select functions * *****************************************************************************/ // Select between two Vec256fe sources, element by element. Used in various functions // and operators. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each element in s must be either 0 (false) or 0xFFFFFFFF (true). static inline Vec256fe selectf (Vec256fe const & s, Vec256fe const & a, Vec256fe const & b) { return Vec256fe(selectf(b.get_low(), a.get_low(), s.get_low()), selectf(b.get_high(), a.get_high(), s.get_high())); } // Same, with two Vec256de sources. // and operators. Corresponds to this pseudocode: // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; // Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other // values are allowed. static inline Vec256de selectd (Vec256de const & s, Vec256de const & a, Vec256de const & b) { return Vec256de(selectd(b.get_low(), a.get_low(), s.get_low()), selectd(b.get_high(), a.get_high(), s.get_high())); } /***************************************************************************** * * Generate compile-time constant vector * *****************************************************************************/ // Generate a constant vector of 8 integers stored in memory, // load as __m256 template static inline Vec256fe constant8f() { static const union { int i[8]; __m128 y[2]; } u = {{i0,i1,i2,i3,i4,i5,i6,i7}}; return Vec256fe(u.y[0], u.y[1]); } /***************************************************************************** * * Vec8fb: Vector of 8 Booleans for use with Vec8f * *****************************************************************************/ class Vec8fb : public Vec256fe { public: // Default constructor: Vec8fb() { } // Constructor to build from all elements: Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) { y0 = Vec4fb(b0, b1, b2, b3); y1 = Vec4fb(b4, b5, b6, b7); } // Constructor to build from two Vec4fb: Vec8fb(Vec4fb const & a0, Vec4fb const & a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256fe Vec8fb(Vec256fe const & x) { y0 = x.get_low(); y1 = x.get_high(); } // Assignment operator to convert from type Vec256fe Vec8fb & operator = (Vec256fe const & x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } #ifdef VECTORI256_H // 256 bit integer vectors are available // Constructor to convert from type Vec8ib used as Boolean for integer vectors Vec8fb(Vec8ib const & x) { y0 = _mm_castsi128_ps(Vec8i(x).get_low()); y1 = _mm_castsi128_ps(Vec8i(x).get_high()); } // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors Vec8fb & operator = (Vec8ib const & x) { y0 = _mm_castsi128_ps(Vec8i(x).get_low()); y1 = _mm_castsi128_ps(Vec8i(x).get_high()); return *this; } // Constructor to broadcast the same value into all elements: Vec8fb(bool b) { y1 = y0 = Vec4fb(b); } // Assignment operator to broadcast scalar value: Vec8fb & operator = (bool b) { y0 = y1 = Vec4fb(b); return *this; } private: // Prevent constructing from int, etc. Vec8fb(int b); Vec8fb & operator = (int x); public: // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors operator Vec8ib() const { return Vec8i(_mm_castps_si128(y0), _mm_castps_si128(y1)); } #endif // VECTORI256_H // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec8fb const & insert(uint32_t index, bool value) { if (index < 4) { y0 = Vec4fb(y0).insert(index, value); } else { y1 = Vec4fb(y1).insert(index-4, value); } return *this; } // Member function extract a single element from vector // Note: This function is inefficient. Use store function if extracting more than one element bool extract(uint32_t index) const { if (index < 4) { return Vec4fb(y0).extract(index); } else { return Vec4fb(y1).extract(index-4); } } // Extract a single element. Operator [] can only read an element, not write. bool operator [] (uint32_t index) const { return extract(index); } // Member functions to split into two Vec4fb: Vec4fb get_low() const { return y0; } Vec4fb get_high() const { return y1; } static int size () { return 8; } }; /***************************************************************************** * * Operators for Vec8fb * *****************************************************************************/ // vector operator & : bitwise and static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) { return Vec8fb(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) { return Vec8fb(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) { return Vec8fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec8fb operator ~ (Vec8fb const & a) { return Vec8fb(~a.get_low(), ~a.get_high()); } // vector operator ! : logical not // (operator ! is less efficient than operator ~. Use only where not // all bits in an element are the same) static inline Vec8fb operator ! (Vec8fb const & a) { return Vec8fb(!a.get_low(), !a.get_high()); } // Functions for Vec8fb // andnot: a & ~ b static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) { return Vec8fb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); } /***************************************************************************** * * Horizontal Boolean functions * *****************************************************************************/ // horizontal_and. Returns true if all bits are 1 static inline bool horizontal_and (Vec8fb const & a) { return horizontal_and(a.get_low() & a.get_high()); } // horizontal_or. Returns true if at least one bit is 1 static inline bool horizontal_or (Vec8fb const & a) { return horizontal_or(a.get_low() | a.get_high()); } /***************************************************************************** * * Vec4db: Vector of 4 Booleans for use with Vec4d * *****************************************************************************/ class Vec4db : public Vec256de { public: // Default constructor: Vec4db() { } // Constructor to build from all elements: Vec4db(bool b0, bool b1, bool b2, bool b3) { y0 = Vec2db(b0, b1); y1 = Vec2db(b2, b3); } // Constructor to build from two Vec2db: Vec4db(Vec2db const & a0, Vec2db const & a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256de Vec4db(Vec256de const & x) { y0 = x.get_low(); y1 = x.get_high(); } // Assignment operator to convert from type Vec256de Vec4db & operator = (Vec256de const & x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } #ifdef VECTORI256_H // 256 bit integer vectors are available // Constructor to convert from type Vec4qb used as Boolean for integer vectors Vec4db(Vec4qb const & x) { y0 = _mm_castsi128_pd(Vec4q(x).get_low()); y1 = _mm_castsi128_pd(Vec4q(x).get_high()); } // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors Vec4db & operator = (Vec4qb const & x) { y0 = _mm_castsi128_pd(Vec4q(x).get_low()); y1 = _mm_castsi128_pd(Vec4q(x).get_high()); return *this; } // Constructor to broadcast the same value into all elements: Vec4db(bool b) { y1 = y0 = Vec2db(b); } // Assignment operator to broadcast scalar value: Vec4db & operator = (bool b) { y0 = y1 = Vec2db(b); return *this; } private: // Prevent constructing from int, etc. Vec4db(int b); Vec4db & operator = (int x); public: // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors operator Vec4qb() const { return Vec4q(_mm_castpd_si128(y0), _mm_castpd_si128(y1)); } #endif // VECTORI256_H // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec4db const & insert(uint32_t index, bool value) { if (index < 2) { y0 = Vec2db(y0).insert(index, value); } else { y1 = Vec2db(y1).insert(index - 2, value); } return *this; } // Member function extract a single element from vector // Note: This function is inefficient. Use store function if extracting more than one element bool extract(uint32_t index) const { if (index < 2) { return Vec2db(y0).extract(index); } else { return Vec2db(y1).extract(index - 2); } } // Extract a single element. Operator [] can only read an element, not write. bool operator [] (uint32_t index) const { return extract(index); } // Member functions to split into two Vec4fb: Vec2db get_low() const { return y0; } Vec2db get_high() const { return y1; } static int size () { return 4; } }; /***************************************************************************** * * Operators for Vec4db * *****************************************************************************/ // vector operator & : bitwise and static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) { return Vec4db(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) { return Vec4db(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) { return Vec4db(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec4db operator ~ (Vec4db const & a) { return Vec4db(~a.get_low(), ~a.get_high()); } // vector operator ! : logical not // (operator ! is less efficient than operator ~. Use only where not // all bits in an element are the same) static inline Vec4db operator ! (Vec4db const & a) { return Vec4db(!a.get_low(), !a.get_high()); } // Functions for Vec4db // andnot: a & ~ b static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) { return Vec4db(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); } /***************************************************************************** * * Horizontal Boolean functions * *****************************************************************************/ // horizontal_and. Returns true if all bits are 1 static inline bool horizontal_and (Vec4db const & a) { return horizontal_and(a.get_low() & a.get_high()); } // horizontal_or. Returns true if at least one bit is 1 static inline bool horizontal_or (Vec4db const & a) { return horizontal_or(a.get_low() | a.get_high()); } /***************************************************************************** * * Vec8f: Vector of 8 single precision floating point values * *****************************************************************************/ class Vec8f : public Vec256fe { public: // Default constructor: Vec8f() { } // Constructor to broadcast the same value into all elements: Vec8f(float f) { y1 = y0 = _mm_set1_ps(f); } // Constructor to build from all elements: Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) { y0 = _mm_setr_ps(f0, f1, f2, f3); y1 = _mm_setr_ps(f4, f5, f6, f7); } // Constructor to build from two Vec4f: Vec8f(Vec4f const & a0, Vec4f const & a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256fe Vec8f(Vec256fe const & x) { y0 = x.get_low(); y1 = x.get_high(); } // Assignment operator to convert from type Vec256fe Vec8f & operator = (Vec256fe const & x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } // Member function to load from array (unaligned) Vec8f & load(float const * p) { y0 = _mm_loadu_ps(p); y1 = _mm_loadu_ps(p+4); return *this; } // Member function to load from array, aligned by 32 // You may use load_a instead of load if you are certain that p points to an address // divisible by 32. Vec8f & load_a(float const * p) { y0 = _mm_load_ps(p); y1 = _mm_load_ps(p+4); return *this; } // Member function to store into array (unaligned) void store(float * p) const { _mm_storeu_ps(p, y0); _mm_storeu_ps(p+4, y1); } // Member function to store into array, aligned by 32 // You may use store_a instead of store if you are certain that p points to an address // divisible by 32. void store_a(float * p) const { _mm_store_ps(p, y0); _mm_store_ps(p+4, y1); } // Partial load. Load n elements and set the rest to 0 Vec8f & load_partial(int n, float const * p) { if (n > 0 && n <= 4) { *this = Vec8f(Vec4f().load_partial(n, p),_mm_setzero_ps()); } else if (n > 4 && n <= 8) { *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4)); } else { y1 = y0 = _mm_setzero_ps(); } return *this; } // Partial store. Store n elements void store_partial(int n, float * p) const { if (n <= 4) { get_low().store_partial(n, p); } else if (n <= 8) { get_low().store(p); get_high().store_partial(n - 4, p + 4); } } // cut off vector to n elements. The last 8-n elements are set to zero Vec8f & cutoff(int n) { if (uint32_t(n) >= 8) return *this; else if (n >= 4) { y1 = Vec4f(y1).cutoff(n - 4); } else { y0 = Vec4f(y0).cutoff(n); y1 = Vec4f(0.0f); } return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec8f const & insert(uint32_t index, float value) { if (index < 4) { y0 = Vec4f(y0).insert(index, value); } else { y1 = Vec4f(y1).insert(index - 4, value); } return *this; } // Member function extract a single element from vector // Note: This function is inefficient. Use store function if extracting more than one element float extract(uint32_t index) const { if (index < 4) { return Vec4f(y0).extract(index); } else { return Vec4f(y1).extract(index - 4); } } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. float operator [] (uint32_t index) const { return extract(index); } // Member functions to split into two Vec4f: Vec4f get_low() const { return y0; } Vec4f get_high() const { return y1; } static int size () { return 8; } }; /***************************************************************************** * * Operators for Vec8f * *****************************************************************************/ // vector operator + : add element by element static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) { return Vec8f(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator + : add vector and scalar static inline Vec8f operator + (Vec8f const & a, float b) { return a + Vec8f(b); } static inline Vec8f operator + (float a, Vec8f const & b) { return Vec8f(a) + b; } // vector operator += : add static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) { a = a + b; return a; } // postfix operator ++ static inline Vec8f operator ++ (Vec8f & a, int) { Vec8f a0 = a; a = a + 1.0f; return a0; } // prefix operator ++ static inline Vec8f & operator ++ (Vec8f & a) { a = a + 1.0f; return a; } // vector operator - : subtract element by element static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) { return Vec8f(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : subtract vector and scalar static inline Vec8f operator - (Vec8f const & a, float b) { return a - Vec8f(b); } static inline Vec8f operator - (float a, Vec8f const & b) { return Vec8f(a) - b; } // vector operator - : unary minus // Change sign bit, even for 0, INF and NAN static inline Vec8f operator - (Vec8f const & a) { return Vec8f(-a.get_low(), -a.get_high()); } // vector operator -= : subtract static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) { a = a - b; return a; } // postfix operator -- static inline Vec8f operator -- (Vec8f & a, int) { Vec8f a0 = a; a = a - 1.0f; return a0; } // prefix operator -- static inline Vec8f & operator -- (Vec8f & a) { a = a - 1.0f; return a; } // vector operator * : multiply element by element static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) { return Vec8f(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator * : multiply vector and scalar static inline Vec8f operator * (Vec8f const & a, float b) { return a * Vec8f(b); } static inline Vec8f operator * (float a, Vec8f const & b) { return Vec8f(a) * b; } // vector operator *= : multiply static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) { a = a * b; return a; } // vector operator / : divide all elements by same integer static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) { return Vec8f(a.get_low() / b.get_low(), a.get_high() / b.get_high()); } // vector operator / : divide vector and scalar static inline Vec8f operator / (Vec8f const & a, float b) { return a / Vec8f(b); } static inline Vec8f operator / (float a, Vec8f const & b) { return Vec8f(a) / b; } // vector operator /= : divide static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) { a = a / b; return a; } // vector operator == : returns true for elements for which a == b static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) { return Vec8fb(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for which a != b static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) { return Vec8fb(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } // vector operator < : returns true for elements for which a < b static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) { return Vec8fb(a.get_low() < b.get_low(), a.get_high() < b.get_high()); } // vector operator <= : returns true for elements for which a <= b static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) { return Vec8fb(a.get_low() <= b.get_low(), a.get_high() <= b.get_high()); } // vector operator > : returns true for elements for which a > b static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) { return Vec8fb(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator >= : returns true for elements for which a >= b static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) { return Vec8fb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // Bitwise logical operators // vector operator & : bitwise and static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) { return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator &= : bitwise and static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) { a = a & b; return a; } // vector operator & : bitwise and of Vec8f and Vec8fb static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) { return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) { return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator | : bitwise or static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) { return Vec8f(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } // vector operator |= : bitwise or static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) { return Vec8f(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) { a = a ^ b; return a; } // vector operator ! : logical not. Returns Boolean vector static inline Vec8fb operator ! (Vec8f const & a) { return Vec8fb(!a.get_low(), !a.get_high()); } /***************************************************************************** * * Functions for Vec8f * *****************************************************************************/ // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed. static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) { return Vec8f(select(s.get_low(),a.get_low(),b.get_low()), select(s.get_high(),a.get_high(),b.get_high())); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) { return a + (Vec8f(f) & b); } // Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) { return a * select(f, b, 1.f); } // General arithmetic functions, etc. // Horizontal add: Calculates the sum of all vector elements. static inline float horizontal_add (Vec8f const & a) { return horizontal_add(a.get_low() + a.get_high()); } // function max: a > b ? a : b static inline Vec8f max(Vec8f const & a, Vec8f const & b) { return Vec8f(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b static inline Vec8f min(Vec8f const & a, Vec8f const & b) { return Vec8f(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } // function abs: absolute value // Removes sign bit, even for -0.0f, -INF and -NAN static inline Vec8f abs(Vec8f const & a) { return Vec8f(abs(a.get_low()), abs(a.get_high())); } // function sqrt: square root static inline Vec8f sqrt(Vec8f const & a) { return Vec8f(sqrt(a.get_low()), sqrt(a.get_high())); } // function square: a * a static inline Vec8f square(Vec8f const & a) { return Vec8f(square(a.get_low()), square(a.get_high())); } // pow(Vec8f, int): template static Vec8f pow(Vec8f const & a, TT n); // Raise floating point numbers to integer power n template <> inline Vec8f pow(Vec8f const & x0, int n) { return pow_template_i(x0, n); } // allow conversion from unsigned int template <> inline Vec8f pow(Vec8f const & x0, uint32_t n) { return pow_template_i(x0, (int)n); } // Raise floating point numbers to integer power n, where n is a compile-time constant template static inline Vec8f pow_n(Vec8f const & a) { return Vec8f(pow_n(a.get_low()), pow_n(a.get_high())); } template static inline Vec8f pow(Vec8f const & a, Const_int_t) { return pow_n(a); } // function round: round to nearest integer (even). (result as float vector) static inline Vec8f round(Vec8f const & a) { return Vec8f(round(a.get_low()), round(a.get_high())); } // function truncate: round towards zero. (result as float vector) static inline Vec8f truncate(Vec8f const & a) { return Vec8f(truncate(a.get_low()), truncate(a.get_high())); } // function floor: round towards minus infinity. (result as float vector) static inline Vec8f floor(Vec8f const & a) { return Vec8f(floor(a.get_low()), floor(a.get_high())); } // function ceil: round towards plus infinity. (result as float vector) static inline Vec8f ceil(Vec8f const & a) { return Vec8f(ceil(a.get_low()), ceil(a.get_high())); } #ifdef VECTORI256_H // 256 bit integer vectors are available // function round_to_int: round to nearest integer (even). (result as integer vector) static inline Vec8i round_to_int(Vec8f const & a) { // Note: assume MXCSR control register is set to rounding return Vec8i(round_to_int(a.get_low()), round_to_int(a.get_high())); } // function truncate_to_int: round towards zero. (result as integer vector) static inline Vec8i truncate_to_int(Vec8f const & a) { return Vec8i(truncate_to_int(a.get_low()), truncate_to_int(a.get_high())); } // function to_float: convert integer vector to float vector static inline Vec8f to_float(Vec8i const & a) { return Vec8f(to_float(a.get_low()), to_float(a.get_high())); } #endif // VECTORI256_H // Approximate math functions // approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11) static inline Vec8f approx_recipr(Vec8f const & a) { return Vec8f(approx_recipr(a.get_low()), approx_recipr(a.get_high())); } // approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11) static inline Vec8f approx_rsqrt(Vec8f const & a) { return Vec8f(approx_rsqrt(a.get_low()), approx_rsqrt(a.get_high())); } // Fused multiply and add functions // Multiply and add static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { return Vec8f(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high())); } // Multiply and subtract static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) { return Vec8f(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high())); } // Multiply and inverse subtract static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { return Vec8f(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high())); } // Multiply and subtract with extra precision on the intermediate calculations, // even if FMA instructions not supported, using Veltkamp-Dekker split static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) { return Vec8f(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high())); } // Math functions using fast bit manipulation #ifdef VECTORI256_H // 256 bit integer vectors are available // Extract the exponent as an integer // exponent(a) = floor(log2(abs(a))); // exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 static inline Vec8i exponent(Vec8f const & a) { return Vec8i(exponent(a.get_low()), exponent(a.get_high())); } #endif // Extract the fraction part of a floating point number // a = 2^exponent(a) * fraction(a), except for a = 0 // fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f static inline Vec8f fraction(Vec8f const & a) { return Vec8f(fraction(a.get_low()), fraction(a.get_high())); } #ifdef VECTORI256_H // 256 bit integer vectors are available // Fast calculation of pow(2,n) with n integer // n = 0 gives 1.0f // n >= 128 gives +INF // n <= -127 gives 0.0f // This function will never produce denormals, and never raise exceptions static inline Vec8f exp2(Vec8i const & a) { return Vec8f(exp2(a.get_low()), exp2(a.get_high())); } //static Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h #endif // VECTORI256_H // Categorization functions // Function sign_bit: gives true for elements that have the sign bit set // even for -0.0f, -INF and -NAN // Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) static inline Vec8fb sign_bit(Vec8f const & a) { return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high())); } // Function sign_combine: changes the sign of a when b has the sign bit set // same as select(sign_bit(b), -a, a) static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) { return Vec8f(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high())); } // Function is_finite: gives true for elements that are normal, denormal or zero, // false for INF and NAN // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) static inline Vec8fb is_finite(Vec8f const & a) { return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high())); } // Function is_inf: gives true for elements that are +INF or -INF // false for finite numbers and NAN // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) static inline Vec8fb is_inf(Vec8f const & a) { return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high())); } // Function is_nan: gives true for elements that are +NAN or -NAN // false for finite numbers and +/-INF // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) static inline Vec8fb is_nan(Vec8f const & a) { return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high())); } // Function is_subnormal: gives true for elements that are denormal (subnormal) // false for finite numbers, zero, NAN and INF static inline Vec8fb is_subnormal(Vec8f const & a) { return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high())); } // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) // false for finite numbers, NAN and INF static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) { return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high())); } // Function infinite4f: returns a vector where all elements are +INF static inline Vec8f infinite8f() { return constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>(); } // Function nan4f: returns a vector where all elements are +NAN (quiet) static inline Vec8f nan8f(int n = 0x10) { return Vec8f(nan4f(n), nan4f(n)); } // change signs on vectors Vec8f // Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change template static inline Vec8f change_sign(Vec8f const & a) { if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a; Vec4f lo = change_sign(a.get_low()); Vec4f hi = change_sign(a.get_high()); return Vec8f(lo, hi); } /***************************************************************************** * * Vec2d: Vector of 2 double precision floating point values * *****************************************************************************/ class Vec4d : public Vec256de { public: // Default constructor: Vec4d() { } // Constructor to broadcast the same value into all elements: Vec4d(double d) { y1 = y0 = _mm_set1_pd(d); } // Constructor to build from all elements: Vec4d(double d0, double d1, double d2, double d3) { y0 = _mm_setr_pd(d0, d1); y1 = _mm_setr_pd(d2, d3); } // Constructor to build from two Vec4f: Vec4d(Vec2d const & a0, Vec2d const & a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256de Vec4d(Vec256de const & x) { y0 = x.get_low(); y1 = x.get_high(); } // Assignment operator to convert from type Vec256de Vec4d & operator = (Vec256de const & x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } // Member function to load from array (unaligned) Vec4d & load(double const * p) { y0 = _mm_loadu_pd(p); y1 = _mm_loadu_pd(p+2); return *this; } // Member function to load from array, aligned by 32 // You may use load_a instead of load if you are certain that p points to an address // divisible by 32 Vec4d & load_a(double const * p) { y0 = _mm_load_pd(p); y1 = _mm_load_pd(p+2); return *this; } // Member function to store into array (unaligned) void store(double * p) const { _mm_storeu_pd(p, y0); _mm_storeu_pd(p+2, y1); } // Member function to store into array, aligned by 32 // You may use store_a instead of store if you are certain that p points to an address // divisible by 32 void store_a(double * p) const { _mm_store_pd(p, y0); _mm_store_pd(p+2, y1); } // Partial load. Load n elements and set the rest to 0 Vec4d & load_partial(int n, double const * p) { if (n > 0 && n <= 2) { *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd()); } else if (n > 2 && n <= 4) { *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2)); } else { y1 = y0 = _mm_setzero_pd(); } return *this; } // Partial store. Store n elements void store_partial(int n, double * p) const { if (n <= 2) { get_low().store_partial(n, p); } else if (n <= 4) { get_low().store(p); get_high().store_partial(n - 2, p + 2); } } Vec4d & cutoff(int n) { if (uint32_t(n) >= 4) return *this; else if (n >= 2) { y1 = Vec2d(y1).cutoff(n - 2); } else { y0 = Vec2d(y0).cutoff(n); y1 = Vec2d(0.0); } return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec4d const & insert(uint32_t index, double value) { if (index < 2) { y0 = Vec2d(y0).insert(index, value); } else { y1 = Vec2d(y1).insert(index-2, value); } return *this; } // Member function extract a single element from vector // Note: This function is inefficient. Use store function if extracting more than one element double extract(uint32_t index) const { if (index < 2) { return Vec2d(y0).extract(index); } else { return Vec2d(y1).extract(index-2); } } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. double operator [] (uint32_t index) const { return extract(index); } // Member functions to split into two Vec2d: Vec2d get_low() const { return y0; } Vec2d get_high() const { return y1; } static int size () { return 2; } }; /***************************************************************************** * * Operators for Vec4d * *****************************************************************************/ // vector operator + : add element by element static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) { return Vec4d(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator + : add vector and scalar static inline Vec4d operator + (Vec4d const & a, double b) { return a + Vec4d(b); } static inline Vec4d operator + (double a, Vec4d const & b) { return Vec4d(a) + b; } // vector operator += : add static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) { a = a + b; return a; } // postfix operator ++ static inline Vec4d operator ++ (Vec4d & a, int) { Vec4d a0 = a; a = a + 1.0; return a0; } // prefix operator ++ static inline Vec4d & operator ++ (Vec4d & a) { a = a + 1.0; return a; } // vector operator - : subtract element by element static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) { return Vec4d(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : subtract vector and scalar static inline Vec4d operator - (Vec4d const & a, double b) { return a - Vec4d(b); } static inline Vec4d operator - (double a, Vec4d const & b) { return Vec4d(a) - b; } // vector operator - : unary minus // Change sign bit, even for 0, INF and NAN static inline Vec4d operator - (Vec4d const & a) { return Vec4d(-a.get_low(), -a.get_high()); } // vector operator -= : subtract static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) { a = a - b; return a; } // postfix operator -- static inline Vec4d operator -- (Vec4d & a, int) { Vec4d a0 = a; a = a - 1.0; return a0; } // prefix operator -- static inline Vec4d & operator -- (Vec4d & a) { a = a - 1.0; return a; } // vector operator * : multiply element by element static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) { return Vec4d(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator * : multiply vector and scalar static inline Vec4d operator * (Vec4d const & a, double b) { return a * Vec4d(b); } static inline Vec4d operator * (double a, Vec4d const & b) { return Vec4d(a) * b; } // vector operator *= : multiply static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) { a = a * b; return a; } // vector operator / : divide all elements by same integer static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) { return Vec4d(a.get_low() / b.get_low(), a.get_high() / b.get_high()); } // vector operator / : divide vector and scalar static inline Vec4d operator / (Vec4d const & a, double b) { return a / Vec4d(b); } static inline Vec4d operator / (double a, Vec4d const & b) { return Vec4d(a) / b; } // vector operator /= : divide static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) { a = a / b; return a; } // vector operator == : returns true for elements for which a == b static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) { return Vec4db(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for which a != b static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) { return Vec4db(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } // vector operator < : returns true for elements for which a < b static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) { return Vec4db(a.get_low() < b.get_low(), a.get_high() < b.get_high()); } // vector operator <= : returns true for elements for which a <= b static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) { return Vec4db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high()); } // vector operator > : returns true for elements for which a > b static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) { return Vec4db(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator >= : returns true for elements for which a >= b static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) { return Vec4db(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // Bitwise logical operators // vector operator & : bitwise and static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) { return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator &= : bitwise and static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) { a = a & b; return a; } // vector operator & : bitwise and of Vec4d and Vec4db static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) { return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) { return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator | : bitwise or static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) { return Vec4d(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } // vector operator |= : bitwise or static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) { return Vec4d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) { a = a ^ b; return a; } // vector operator ! : logical not. Returns Boolean vector static inline Vec4db operator ! (Vec4d const & a) { return Vec4db(!a.get_low(), !a.get_high()); } /***************************************************************************** * * Functions for Vec4d * *****************************************************************************/ // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). // No other values are allowed. static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) { return Vec4d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) { return a + (Vec4d(f) & b); } // Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) { return a * select(f, b, 1.f); } // General arithmetic functions, etc. // Horizontal add: Calculates the sum of all vector elements. static inline double horizontal_add (Vec4d const & a) { return horizontal_add(a.get_low() + a.get_high()); } // function max: a > b ? a : b static inline Vec4d max(Vec4d const & a, Vec4d const & b) { return Vec4d(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b static inline Vec4d min(Vec4d const & a, Vec4d const & b) { return Vec4d(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } // function abs: absolute value // Removes sign bit, even for -0.0f, -INF and -NAN static inline Vec4d abs(Vec4d const & a) { return Vec4d(abs(a.get_low()), abs(a.get_high())); } // function sqrt: square root static inline Vec4d sqrt(Vec4d const & a) { return Vec4d(sqrt(a.get_low()), sqrt(a.get_high())); } // function square: a * a static inline Vec4d square(Vec4d const & a) { return Vec4d(square(a.get_low()), square(a.get_high())); } // pow(Vec4d, int): // Raise floating point numbers to integer power n template static Vec4d pow(Vec4d const & a, TT n); // Raise floating point numbers to integer power n template <> inline Vec4d pow(Vec4d const & x0, int n) { return pow_template_i(x0, n); } // allow conversion from unsigned int template <> inline Vec4d pow(Vec4d const & x0, uint32_t n) { return pow_template_i(x0, (int)n); } // Raise floating point numbers to integer power n, where n is a compile-time constant template static inline Vec4d pow_n(Vec4d const & a) { return Vec4d(pow_n(a.get_low()), pow_n(a.get_high())); } template static inline Vec4d pow(Vec4d const & a, Const_int_t) { return pow_n(a); } // function round: round to nearest integer (even). (result as double vector) static inline Vec4d round(Vec4d const & a) { return Vec4d(round(a.get_low()), round(a.get_high())); } // function truncate: round towards zero. (result as double vector) static inline Vec4d truncate(Vec4d const & a) { return Vec4d(truncate(a.get_low()), truncate(a.get_high())); } // function floor: round towards minus infinity. (result as double vector) static inline Vec4d floor(Vec4d const & a) { return Vec4d(floor(a.get_low()), floor(a.get_high())); } // function ceil: round towards plus infinity. (result as double vector) static inline Vec4d ceil(Vec4d const & a) { return Vec4d(ceil(a.get_low()), ceil(a.get_high())); } // function round_to_int: round to nearest integer (even). (result as integer vector) static inline Vec4i round_to_int(Vec4d const & a) { // Note: assume MXCSR control register is set to rounding return round_to_int(a.get_low(), a.get_high()); } // function truncate_to_int: round towards zero. (result as integer vector) static inline Vec4i truncate_to_int(Vec4d const & a) { return truncate_to_int(a.get_low(), a.get_high()); } #ifdef VECTORI256_H // 256 bit integer vectors are available // function truncate_to_int64: round towards zero. (inefficient) static inline Vec4q truncate_to_int64(Vec4d const & a) { double aa[4]; a.store(aa); return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3])); } // function truncate_to_int64_limited: round towards zero. // result as 64-bit integer vector, but with limited range static inline Vec4q truncate_to_int64_limited(Vec4d const & a) { return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high())); } // function round_to_int64: round to nearest or even. (inefficient) static inline Vec4q round_to_int64(Vec4d const & a) { return truncate_to_int64(round(a)); } // function round_to_int64_limited: round to nearest integer // result as 64-bit integer vector, but with limited range static inline Vec4q round_to_int64_limited(Vec4d const & a) { return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high())); } // function to_double: convert integer vector elements to double vector (inefficient) static inline Vec4d to_double(Vec4q const & a) { int64_t aa[4]; a.store(aa); return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3])); } // function to_double_limited: convert integer vector elements to double vector // limited to abs(x) < 2^31 static inline Vec4d to_double_limited(Vec4q const & x) { return Vec4d(to_double_limited(x.get_low()),to_double_limited(x.get_high())); } #endif // VECTORI256_H // function to_double: convert integer vector to double vector static inline Vec4d to_double(Vec4i const & a) { return Vec4d(to_double_low(a), to_double_high(a)); } // function compress: convert two Vec4d to one Vec8f static inline Vec8f compress (Vec4d const & low, Vec4d const & high) { return Vec8f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high())); } // Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d static inline Vec4d extend_low (Vec8f const & a) { return Vec4d(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d static inline Vec4d extend_high (Vec8f const & a) { return Vec4d(extend_low(a.get_high()), extend_high(a.get_high())); } // Fused multiply and add functions // Multiply and add static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { return Vec4d(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high())); } // Multiply and subtract static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) { return Vec4d(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high())); } // Multiply and inverse subtract static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { return Vec4d(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high())); } // Multiply and subtract with extra precision on the intermediate calculations, // even if FMA instructions not supported, using Veltkamp-Dekker split static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) { return Vec4d(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high())); } // Math functions using fast bit manipulation #ifdef VECTORI256_H // 256 bit integer vectors are available, AVX2 // Extract the exponent as an integer // exponent(a) = floor(log2(abs(a))); // exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 static inline Vec4q exponent(Vec4d const & a) { return Vec4q(exponent(a.get_low()), exponent(a.get_high())); } // Extract the fraction part of a floating point number // a = 2^exponent(a) * fraction(a), except for a = 0 // fraction(1.0) = 1.0, fraction(5.0) = 1.25 static inline Vec4d fraction(Vec4d const & a) { return Vec4d(fraction(a.get_low()), fraction(a.get_high())); } // Fast calculation of pow(2,n) with n integer // n = 0 gives 1.0 // n >= 1024 gives +INF // n <= -1023 gives 0.0 // This function will never produce denormals, and never raise exceptions static inline Vec4d exp2(Vec4q const & a) { return Vec4d(exp2(a.get_low()), exp2(a.get_high())); } //static Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h #endif // Categorization functions // Function sign_bit: gives true for elements that have the sign bit set // even for -0.0, -INF and -NAN // Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false static inline Vec4db sign_bit(Vec4d const & a) { return Vec4db(sign_bit(a.get_low()), sign_bit(a.get_high())); } // Function sign_combine: changes the sign of a when b has the sign bit set // same as select(sign_bit(b), -a, a) static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) { return Vec4d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high())); } // Function is_finite: gives true for elements that are normal, denormal or zero, // false for INF and NAN static inline Vec4db is_finite(Vec4d const & a) { return Vec4db(is_finite(a.get_low()), is_finite(a.get_high())); } // Function is_inf: gives true for elements that are +INF or -INF // false for finite numbers and NAN static inline Vec4db is_inf(Vec4d const & a) { return Vec4db(is_inf(a.get_low()), is_inf(a.get_high())); } // Function is_nan: gives true for elements that are +NAN or -NAN // false for finite numbers and +/-INF static inline Vec4db is_nan(Vec4d const & a) { return Vec4db(is_nan(a.get_low()), is_nan(a.get_high())); } // Function is_subnormal: gives true for elements that are denormal (subnormal) // false for finite numbers, zero, NAN and INF static inline Vec4db is_subnormal(Vec4d const & a) { return Vec4db(is_subnormal(a.get_low()), is_subnormal(a.get_high())); } // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) // false for finite numbers, NAN and INF static inline Vec4db is_zero_or_subnormal(Vec4d const & a) { return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high())); } // Function infinite2d: returns a vector where all elements are +INF static inline Vec4d infinite4d() { return Vec4d(infinite2d(), infinite2d()); } // Function nan2d: returns a vector where all elements are +NAN (quiet) static inline Vec4d nan4d(int n = 0x10) { return Vec4d(nan2d(n), nan2d(n)); } // change signs on vectors Vec4d // Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change template static inline Vec4d change_sign(Vec4d const & a) { if ((i0 | i1 | i2 | i3) == 0) return a; Vec2d lo = change_sign(a.get_low()); Vec2d hi = change_sign(a.get_high()); return Vec4d(lo, hi); } /***************************************************************************** * * Functions for reinterpretation between vector types * *****************************************************************************/ static inline Vec256ie reinterpret_i (Vec256ie const & x) { return x; } static inline Vec256ie reinterpret_i (Vec256fe const & x) { return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); } static inline Vec256ie reinterpret_i (Vec256de const & x) { return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); } static inline Vec256fe reinterpret_f (Vec256ie const & x) { return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high())); } static inline Vec256fe reinterpret_f (Vec256fe const & x) { return x; } static inline Vec256fe reinterpret_f (Vec256de const & x) { return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high())); } static inline Vec256de reinterpret_d (Vec256ie const & x) { return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high())); } static inline Vec256de reinterpret_d (Vec256fe const & x) { return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high())); } static inline Vec256de reinterpret_d (Vec256de const & x) { return x; } /***************************************************************************** * * Vector permute and blend functions * ****************************************************************************** * * The permute function can reorder the elements of a vector and optionally * set some elements to zero. * * The indexes are inserted as template parameters in <>. These indexes must be * constants. Each template parameter is an index to the element you want to * select. An index of -1 will generate zero. An index of -256 means don't care. * * Example: * Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13) * Vec4d b; * b = permute4d<1,0,-1,3>(a); // b is (11, 10, 0, 13) * * * The blend function can mix elements from two different vectors and * optionally set some elements to zero. * * The indexes are inserted as template parameters in <>. These indexes must be * constants. Each template parameter is an index to the element you want to * select, where indexes 0 - 3 indicate an element from the first source * vector and indexes 4 - 7 indicate an element from the second source vector. * A negative index will generate zero. * * * Example: * Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13) * Vec4d b(20., 21., 22., 23.); // a is (20, 21, 22, 23) * Vec4d c; * c = blend4d<4,3,7,-1> (a,b); // c is (20, 13, 23, 0) *****************************************************************************/ // permute vector Vec4d template static inline Vec4d permute4d(Vec4d const & a) { return Vec4d(blend2d (a.get_low(), a.get_high()), blend2d (a.get_low(), a.get_high())); } // helper function used below template static inline Vec2d select4(Vec4d const & a, Vec4d const & b) { switch (n) { case 0: return a.get_low(); case 1: return a.get_high(); case 2: return b.get_low(); case 3: return b.get_high(); } return _mm_setzero_pd(); } // blend vectors Vec4d template static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) { const int j0 = i0 >= 0 ? i0/2 : i0; const int j1 = i1 >= 0 ? i1/2 : i1; const int j2 = i2 >= 0 ? i2/2 : i2; const int j3 = i3 >= 0 ? i3/2 : i3; Vec2d x0, x1; if (j0 == j1 || i0 < 0 || i1 < 0) { // both from same const int k0 = j0 >= 0 ? j0 : j1; x0 = permute2d (select4 (a,b)); } else { x0 = blend2d (select4(a,b), select4(a,b)); } if (j2 == j3 || i2 < 0 || i3 < 0) { // both from same const int k1 = j2 >= 0 ? j2 : j3; x1 = permute2d (select4 (a,b)); } else { x1 = blend2d (select4(a,b), select4(a,b)); } return Vec4d(x0,x1); } /***************************************************************************** * * Vector Vec8f permute and blend functions * *****************************************************************************/ // permute vector Vec8f template static inline Vec8f permute8f(Vec8f const & a) { return Vec8f(blend4f (a.get_low(), a.get_high()), blend4f (a.get_low(), a.get_high())); } // helper function used below template static inline Vec4f select4(Vec8f const & a, Vec8f const & b) { switch (n) { case 0: return a.get_low(); case 1: return a.get_high(); case 2: return b.get_low(); case 3: return b.get_high(); } return _mm_setzero_ps(); } // blend vectors Vec8f template static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) { const int j0 = i0 >= 0 ? i0/4 : i0; const int j1 = i1 >= 0 ? i1/4 : i1; const int j2 = i2 >= 0 ? i2/4 : i2; const int j3 = i3 >= 0 ? i3/4 : i3; const int j4 = i4 >= 0 ? i4/4 : i4; const int j5 = i5 >= 0 ? i5/4 : i5; const int j6 = i6 >= 0 ? i6/4 : i6; const int j7 = i7 >= 0 ? i7/4 : i7; Vec4f x0, x1; const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3; const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7; const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3; const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7; // Combine all the indexes into a single bitfield, with 4 bits for each const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; // Mask to zero out negative indexes const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; if (r0 < 0) { x0 = _mm_setzero_ps(); } else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) { // i0 - i3 all from same source x0 = permute4f (select4 (a,b)); } else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) { // i0 - i3 all from two sources const int k0 = i0 >= 0 ? i0 & 3 : i0; const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0); const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0); const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0); x0 = blend4f (select4(a,b), select4(a,b)); } else { // i0 - i3 from three or four different sources x0 = blend4f<0,1,6,7> ( blend4f (select4(a,b), select4(a,b)), blend4f<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4(a,b), select4(a,b))); } if (r1 < 0) { x1 = _mm_setzero_ps(); } else if (((m1 ^ r1*0x44440000u) & 0xCCCC0000 & mz) == 0) { // i4 - i7 all from same source x1 = permute4f (select4 (a,b)); } else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) { // i4 - i7 all from two sources const int k4 = i4 >= 0 ? i4 & 3 : i4; const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0); const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0); const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0); x1 = blend4f (select4(a,b), select4(a,b)); } else { // i4 - i7 from three or four different sources x1 = blend4f<0,1,6,7> ( blend4f (select4(a,b), select4(a,b)), blend4f<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4(a,b), select4(a,b))); } return Vec8f(x0,x1); } /***************************************************************************** * * Vector lookup functions * ****************************************************************************** * * These functions use vector elements as indexes into a table. * The table is given as one or more vectors or as an array. * * This can be used for several purposes: * - table lookup * - permute or blend with variable indexes * - blend from more than two sources * - gather non-contiguous data * * An index out of range may produce any value - the actual value produced is * implementation dependent and may be different for different instruction * sets. An index out of range does not produce an error message or exception. * * Example: * Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3) * Vec4f b(1.0f,1.1f,1.2f,1.3f); // table b is (1.0, 1.1, 1.2, 1.3) * Vec4f c; * c = lookup4 (a,b); // result c is (1.2, 1.0, 1.0, 1.3) * *****************************************************************************/ #ifdef VECTORI256_H // Vec8i and Vec4q must be defined static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) { Vec4f r0 = lookup8(index.get_low() , table.get_low(), table.get_high()); Vec4f r1 = lookup8(index.get_high(), table.get_low(), table.get_high()); return Vec8f(r0, r1); } template static inline Vec8f lookup(Vec8i const & index, float const * table) { if (n <= 0) return 0; if (n <= 4) { Vec4f table1 = Vec4f().load(table); return Vec8f( lookup4 (index.get_low(), table1), lookup4 (index.get_high(), table1)); } if (n <= 8) { return lookup8(index, Vec8f().load(table)); } // Limit index Vec8ui index1; if ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8ui(index) & (n-1); } else { // n is not a power of 2, limit to n-1 index1 = min(Vec8ui(index), n-1); } return Vec8f(table[index1[0]],table[index1[1]],table[index1[2]],table[index1[3]], table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]); } static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) { Vec2d r0 = lookup4(index.get_low() , table.get_low(), table.get_high()); Vec2d r1 = lookup4(index.get_high(), table.get_low(), table.get_high()); return Vec4d(r0, r1); } template static inline Vec4d lookup(Vec4q const & index, double const * table) { if (n <= 0) return 0; if (n <= 2) { Vec2d table1 = Vec2d().load(table); return Vec4d( lookup2 (index.get_low(), table1), lookup2 (index.get_high(), table1)); } // Limit index Vec8ui index1; if ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8ui(index) & constant8i(); } else { // n is not a power of 2, limit to n-1 index1 = min(Vec8ui(index), constant8i() ); } Vec4q index2 = Vec4q(index1); return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]); } #endif // VECTORI256_H /***************************************************************************** * * Horizontal scan functions * *****************************************************************************/ // Get index to the first element that is true. Return -1 if all are false static inline int horizontal_find_first(Vec8fb const & x) { return horizontal_find_first(Vec8ib(x)); } static inline int horizontal_find_first(Vec4db const & x) { return horizontal_find_first(Vec4qb(x)); } // Count the number of elements that are true static inline uint32_t horizontal_count(Vec8fb const & x) { return horizontal_count(Vec8ib(x)); } static inline uint32_t horizontal_count(Vec4db const & x) { return horizontal_count(Vec4qb(x)); } /***************************************************************************** * * Boolean <-> bitfield conversion functions * *****************************************************************************/ // to_bits: convert boolean vector to integer bitfield static inline uint8_t to_bits(Vec8fb const & x) { return to_bits(Vec8ib(x)); } // to_Vec8fb: convert integer bitfield to boolean vector static inline Vec8fb to_Vec8fb(uint8_t x) { return Vec8fb(to_Vec8ib(x)); } // to_bits: convert boolean vector to integer bitfield static inline uint8_t to_bits(Vec4db const & x) { return to_bits(Vec4qb(x)); } // to_Vec4db: convert integer bitfield to boolean vector static inline Vec4db to_Vec4db(uint8_t x) { return Vec4db(to_Vec4qb(x)); } #endif // VECTORF256_H