/**************************** vectori128.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 * Last modified: 2014-10-24 * Version: 1.16 * Project: vector classes * Description: * Header file defining integer vector classes as interface to intrinsic * functions in x86 microprocessors with SSE2 and later instruction sets * up to AVX. * * Instructions: * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired * instruction set, which must be at least SSE2. Specify the supported * instruction set by a command line define, e.g. __SSE4_1__ if the * compiler does not automatically do so. * * The following vector classes are defined here: * Vec128b Vector of 128 1-bit unsigned integers or Booleans * Vec16c Vector of 16 8-bit signed integers * Vec16uc Vector of 16 8-bit unsigned integers * Vec16cb Vector of 16 Booleans for use with Vec16c and Vec16uc * Vec8s Vector of 8 16-bit signed integers * Vec8us Vector of 8 16-bit unsigned integers * Vec8sb Vector of 8 Booleans for use with Vec8s and Vec8us * Vec4i Vector of 4 32-bit signed integers * Vec4ui Vector of 4 32-bit unsigned integers * Vec4ib Vector of 4 Booleans for use with Vec4i and Vec4ui * Vec2q Vector of 2 64-bit signed integers * Vec2uq Vector of 2 64-bit unsigned integers * Vec2qb Vector of 2 Booleans for use with Vec2q and Vec2uq * * Each vector object is represented internally in the CPU as a 128-bit register. * This header file defines operators and functions for these vectors. * * For example: * Vec4i a(1,2,3,4), b(5,6,7,8), c; * c = a + b; // now c contains (6,8,10,12) * * For detailed instructions, see VectorClass.pdf * * (c) Copyright 2012 - 2013 GNU General Public License http://www.gnu.org/licenses *****************************************************************************/ #ifndef VECTORI128_H #define VECTORI128_H #include "instrset.h" // Select supported instruction set #if INSTRSET < 2 // SSE2 required #error Please compile for the SSE2 instruction set or higher #endif /***************************************************************************** * * Vector of 128 1-bit unsigned integers or Booleans * *****************************************************************************/ class Vec128b { protected: __m128i xmm; // Integer vector public: // Default constructor: Vec128b() { } // Constructor to broadcast the same value into all elements // Removed because of undesired implicit conversions // Vec128b(int i) { // xmm = _mm_set1_epi32(-(i & 1));} // Constructor to convert from type __m128i used in intrinsics: Vec128b(__m128i const & x) { xmm = x; } // Assignment operator to convert from type __m128i used in intrinsics: Vec128b & operator = (__m128i const & x) { xmm = x; return *this; } // Type cast operator to convert to __m128i used in intrinsics operator __m128i() const { return xmm; } // Member function to load from array (unaligned) Vec128b & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array, aligned by 16 // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1, // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. // You may use load_a instead of load if you are certain that p points to an address // divisible by 16. 
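// Example (illustrative sketch; the buffer name and the C++11 alignas
// specifier are assumptions, not part of this header):
//   alignas(16) uint8_t buf[32] = {0};
//   Vec128b b;
//   b.load_a(buf);      // aligned load: buf starts on a 16-byte boundary
//   b.load(buf + 1);    // unaligned load: address need not be divisible by 16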
void load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); } // Member function to store into array (unaligned) void store(void * p) const { _mm_storeu_si128((__m128i*)p, xmm); } // Member function to store into array, aligned by 16 // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1, // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. // You may use store_a instead of store if you are certain that p points to an address // divisible by 16. void store_a(void * p) const { _mm_store_si128((__m128i*)p, xmm); } // Member function to change a single bit // Note: This function is inefficient. Use load function if changing more than one bit Vec128b const & set_bit(uint32_t index, int value) { static const union { uint64_t i[4]; __m128i x[2]; } u = {{1,0,0,1}}; // 2 vectors with bit 0 and 64 set, respectively int w = (index >> 6) & 1; // qword index int bi = index & 0x3F; // bit index within qword w __m128i mask = u.x[w]; mask = _mm_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number b set if (value & 1) { xmm = _mm_or_si128(mask,xmm); } else { xmm = _mm_andnot_si128(mask,xmm); } return *this; } // Member function to get a single bit // Note: This function is inefficient. Use store function if reading more than one bit int get_bit(uint32_t index) const { union { __m128i x; uint8_t i[16]; } u; u.x = xmm; int w = (index >> 3) & 0xF; // byte index int bi = index & 7; // bit index within byte w return (u.i[w] >> bi) & 1; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. bool operator [] (uint32_t index) const { return get_bit(index) != 0; } static int size() { return 128; } }; // Define operators for this class // vector operator & : bitwise and static inline Vec128b operator & (Vec128b const & a, Vec128b const & b) { return _mm_and_si128(a, b); } static inline Vec128b operator && (Vec128b const & a, Vec128b const & b) { return a & b; } // vector operator | : bitwise or static inline Vec128b operator | (Vec128b const & a, Vec128b const & b) { return _mm_or_si128(a, b); } static inline Vec128b operator || (Vec128b const & a, Vec128b const & b) { return a | b; } // vector operator ^ : bitwise xor static inline Vec128b operator ^ (Vec128b const & a, Vec128b const & b) { return _mm_xor_si128(a, b); } // vector operator ~ : bitwise not static inline Vec128b operator ~ (Vec128b const & a) { return _mm_xor_si128(a, _mm_set1_epi32(-1)); } // vector operator &= : bitwise and static inline Vec128b & operator &= (Vec128b & a, Vec128b const & b) { a = a & b; return a; } // vector operator |= : bitwise or static inline Vec128b & operator |= (Vec128b & a, Vec128b const & b) { a = a | b; return a; } // vector operator ^= : bitwise xor static inline Vec128b & operator ^= (Vec128b & a, Vec128b const & b) { a = a ^ b; return a; } // Define functions for this class // function andnot: a & ~ b static inline Vec128b andnot (Vec128b const & a, Vec128b const & b) { return _mm_andnot_si128(b, a); } /***************************************************************************** * * Generate compile-time constant vector * *****************************************************************************/ // Generate a constant vector of 4 integers stored in memory. 
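// For example (illustrative), constant4i<-1,-1,-1,-1>() gives a vector with
// all 128 bits set, as used by horizontal_and below.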
// Can be converted to any integer vector type
template <int i0, int i1, int i2, int i3>
static inline __m128i constant4i() {
    static const union {
        int     i[4];
        __m128i xmm;
    } u = {{i0,i1,i2,i3}};
    return u.xmm;
}


/*****************************************************************************
*
*          selectb function
*
*****************************************************************************/
// Select between two sources, byte by byte. Used in various functions and operators
// Corresponds to this pseudocode:
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
// The implementation depends on the instruction set:
// If SSE4.1 is supported then only bit 7 in each byte of s is checked,
// otherwise all bits in s are used.
static inline __m128i selectb (__m128i const & s, __m128i const & a, __m128i const & b) {
#if INSTRSET >= 5   // SSE4.1 supported
    return _mm_blendv_epi8 (b, a, s);
#else
    return _mm_or_si128(
        _mm_and_si128(s,a),
        _mm_andnot_si128(s,b));
#endif
}


/*****************************************************************************
*
*          Horizontal Boolean functions
*
*****************************************************************************/

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec128b const & a) {
#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST
    return _mm_testc_si128(a,constant4i<-1,-1,-1,-1>()) != 0;
#else
    __m128i t1 = _mm_unpackhi_epi64(a,a);       // get 64 bits down
    __m128i t2 = _mm_and_si128(a,t1);           // and 64 bits
#ifdef __x86_64__
    int64_t t5 = _mm_cvtsi128_si64(t2);         // transfer 64 bits to integer
    return t5 == int64_t(-1);
#else
    __m128i t3 = _mm_srli_epi64(t2,32);         // get 32 bits down
    __m128i t4 = _mm_and_si128(t2,t3);          // and 32 bits
    int     t5 = _mm_cvtsi128_si32(t4);         // transfer 32 bits to integer
    return t5 == -1;
#endif  // __x86_64__
#endif  // INSTRSET
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec128b const & a) {
#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST
    return ! _mm_testz_si128(a,a);
#else
    __m128i t1 = _mm_unpackhi_epi64(a,a);       // get 64 bits down
    __m128i t2 = _mm_or_si128(a,t1);            // or 64 bits
#ifdef __x86_64__
    int64_t t5 = _mm_cvtsi128_si64(t2);         // transfer 64 bits to integer
    return t5 != int64_t(0);
#else
    __m128i t3 = _mm_srli_epi64(t2,32);         // get 32 bits down
    __m128i t4 = _mm_or_si128(t2,t3);           // or 32 bits
    int     t5 = _mm_cvtsi128_si32(t4);         // transfer to integer
    return t5 != 0;
#endif  // __x86_64__
#endif  // INSTRSET
}


/*****************************************************************************
*
*          Vector of 16 8-bit signed integers
*
*****************************************************************************/

class Vec16c : public Vec128b {
public:
    // Default constructor:
    Vec16c() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec16c(int i) {
        xmm = _mm_set1_epi8((char)i);
    }
    // Constructor to build from all elements:
    Vec16c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7,
        int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15) {
        xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
    }
    // Constructor to convert from type __m128i used in intrinsics:
    Vec16c(__m128i const & x) {
        xmm = x;
    }
    // Assignment operator to convert from type __m128i used in intrinsics:
    Vec16c & operator = (__m128i const & x) {
        xmm = x;
        return *this;
    }
    // Type cast operator to convert to __m128i used in intrinsics
    operator __m128i() const {
        return xmm;
    }
    // Member function to load from array (unaligned)
    Vec16c & load(void const * p) {
        xmm = _mm_loadu_si128((__m128i const*)p);
        return *this;
    }
    // Member function to load from array (aligned)
    Vec16c & load_a(void const * p) {
        xmm = _mm_load_si128((__m128i const*)p);
        return *this;
    }
    // Partial load. Load n elements and set the rest to 0
    Vec16c & load_partial(int n, void const * p) {
        if (n >= 16) load(p);
        else if (n <= 0) *this = 0;
        else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) {
            // p is at least 16 bytes from a page boundary. OK to read 16 bytes
            load(p);
        }
        else {
            // worst case. read 1 byte at a time and suffer store forwarding penalty
            char x[16];
            for (int i = 0; i < n; i++) x[i] = ((char *)p)[i];
            load(x);
        }
        cutoff(n);
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, void * p) const {
        if (n >= 16) {
            store(p);
            return;
        }
        if (n <= 0) return;
        // we are not using _mm_maskmoveu_si128 because it is too slow on many processors
        union {
            int8_t  c[16];
            int16_t s[8];
            int32_t i[4];
            int64_t q[2];
        } u;
        store(u.c);
        int j = 0;
        if (n & 8) {
            *(int64_t*)p = u.q[0];
            j += 8;
        }
        if (n & 4) {
            ((int32_t*)p)[j/4] = u.i[j/4];
            j += 4;
        }
        if (n & 2) {
            ((int16_t*)p)[j/2] = u.s[j/2];
            j += 2;
        }
        if (n & 1) {
            ((int8_t*)p)[j] = u.c[j];
        }
    }
    // cut off vector to n elements. The last 16-n elements are set to zero
    Vec16c & cutoff(int n) {
        if (uint32_t(n) >= 16) return *this;
        static const char mask[32] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
        *this &= Vec16c().load(mask+16-n);
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient.
Use load function if changing more than one element Vec16c const & insert(uint32_t index, int8_t value) { static const int8_t maskl[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; __m128i broad = _mm_set1_epi8(value); // broadcast value into all elements __m128i mask = _mm_loadu_si128((__m128i const*)(maskl+16-(index & 0x0F))); // mask with FF at index position xmm = selectb(mask,broad,xmm); return *this; } // Member function extract a single element from vector int8_t extract(uint32_t index) const { int8_t x[16]; store(x); return x[index & 0x0F]; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. int8_t operator [] (uint32_t index) const { return extract(index); } static int size() { return 16; } }; /***************************************************************************** * * Vec16cb: Vector of 16 Booleans for use with Vec16c and Vec16uc * *****************************************************************************/ class Vec16cb : public Vec16c { public: // Default constructor Vec16cb() {} // Constructor to build from all elements: Vec16cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) { xmm = Vec16c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15)); } // Constructor to convert from type __m128i used in intrinsics: Vec16cb(__m128i const & x) { xmm = x; } // Assignment operator to convert from type __m128i used in intrinsics: Vec16cb & operator = (__m128i const & x) { xmm = x; return *this; } // Constructor to broadcast scalar value: Vec16cb(bool b) : Vec16c(-int8_t(b)) { } // Assignment operator to broadcast scalar value: Vec16cb & operator = (bool b) { *this = Vec16cb(b); return *this; } private: // Prevent constructing from int, etc. Vec16cb(int b); Vec16cb & operator = (int x); public: Vec16cb & insert (int index, bool a) { Vec16c::insert(index, -(int)a); return *this; } // Member function extract a single element from vector bool extract(uint32_t index) const { return Vec16c::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
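// Example (illustrative; the operand values are arbitrary):
//   Vec16c  a(2), b(3);
//   Vec16cb m = a < b;      // per-element compare yields a boolean vector (all true here)
//   bool    e = m[5];       // operator [] reads element 5 of the mask (true)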
bool operator [] (uint32_t index) const { return extract(index); } }; /***************************************************************************** * * Define operators for Vec16cb * *****************************************************************************/ // vector operator & : bitwise and static inline Vec16cb operator & (Vec16cb const & a, Vec16cb const & b) { return Vec16cb(Vec128b(a) & Vec128b(b)); } static inline Vec16cb operator && (Vec16cb const & a, Vec16cb const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec16cb & operator &= (Vec16cb & a, Vec16cb const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec16cb operator | (Vec16cb const & a, Vec16cb const & b) { return Vec16cb(Vec128b(a) | Vec128b(b)); } static inline Vec16cb operator || (Vec16cb const & a, Vec16cb const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec16cb & operator |= (Vec16cb & a, Vec16cb const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec16cb operator ^ (Vec16cb const & a, Vec16cb const & b) { return Vec16cb(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise xor static inline Vec16cb & operator ^= (Vec16cb & a, Vec16cb const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec16cb operator ~ (Vec16cb const & a) { return Vec16cb( ~ Vec128b(a)); } // vector operator ! : element not static inline Vec16cb operator ! (Vec16cb const & a) { return ~ a; } // vector function andnot static inline Vec16cb andnot (Vec16cb const & a, Vec16cb const & b) { return Vec16cb(andnot(Vec128b(a), Vec128b(b))); } /***************************************************************************** * * Define operators for Vec16c * *****************************************************************************/ // vector operator + : add element by element static inline Vec16c operator + (Vec16c const & a, Vec16c const & b) { return _mm_add_epi8(a, b); } // vector operator += : add static inline Vec16c & operator += (Vec16c & a, Vec16c const & b) { a = a + b; return a; } // postfix operator ++ static inline Vec16c operator ++ (Vec16c & a, int) { Vec16c a0 = a; a = a + 1; return a0; } // prefix operator ++ static inline Vec16c & operator ++ (Vec16c & a) { a = a + 1; return a; } // vector operator - : subtract element by element static inline Vec16c operator - (Vec16c const & a, Vec16c const & b) { return _mm_sub_epi8(a, b); } // vector operator - : unary minus static inline Vec16c operator - (Vec16c const & a) { return _mm_sub_epi8(_mm_setzero_si128(), a); } // vector operator -= : add static inline Vec16c & operator -= (Vec16c & a, Vec16c const & b) { a = a - b; return a; } // postfix operator -- static inline Vec16c operator -- (Vec16c & a, int) { Vec16c a0 = a; a = a - 1; return a0; } // prefix operator -- static inline Vec16c & operator -- (Vec16c & a) { a = a - 1; return a; } // vector operator * : multiply element by element static inline Vec16c operator * (Vec16c const & a, Vec16c const & b) { // There is no 8-bit multiply in SSE2. 
Split into two 16-bit multiplies __m128i aodd = _mm_srli_epi16(a,8); // odd numbered elements of a __m128i bodd = _mm_srli_epi16(b,8); // odd numbered elements of b __m128i muleven = _mm_mullo_epi16(a,b); // product of even numbered elements __m128i mulodd = _mm_mullo_epi16(aodd,bodd); // product of odd numbered elements mulodd = _mm_slli_epi16(mulodd,8); // put odd numbered elements back in place __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for even positions __m128i product = selectb(mask,muleven,mulodd); // interleave even and odd return product; } // vector operator *= : multiply static inline Vec16c & operator *= (Vec16c & a, Vec16c const & b) { a = a * b; return a; } // vector operator << : shift left all elements static inline Vec16c operator << (Vec16c const & a, int b) { uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out __m128i am = _mm_and_si128(a,_mm_set1_epi8((char)mask)); // remove bits that will overflow __m128i res = _mm_sll_epi16(am,_mm_cvtsi32_si128(b));// 16-bit shifts return res; } // vector operator <<= : shift left static inline Vec16c & operator <<= (Vec16c & a, int b) { a = a << b; return a; } // vector operator >> : shift right arithmetic all elements static inline Vec16c operator >> (Vec16c const & a, int b) { __m128i aeven = _mm_slli_epi16(a,8); // even numbered elements of a. get sign bit in position aeven = _mm_sra_epi16(aeven,_mm_cvtsi32_si128(b+8)); // shift arithmetic, back to position __m128i aodd = _mm_sra_epi16(a,_mm_cvtsi32_si128(b)); // shift odd numbered elements arithmetic __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for even positions __m128i res = selectb(mask,aeven,aodd); // interleave even and odd return res; } // vector operator >>= : shift right arithmetic static inline Vec16c & operator >>= (Vec16c & a, int b) { a = a >> b; return a; } // vector operator == : returns true for elements for which a == b static inline Vec16cb operator == (Vec16c const & a, Vec16c const & b) { return _mm_cmpeq_epi8(a,b); } // vector operator != : returns true for elements for which a != b static inline Vec16cb operator != (Vec16c const & a, Vec16c const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comneq_epi8(a,b); #else // SSE2 instruction set return Vec16cb(Vec16c(~(a == b))); #endif } // vector operator > : returns true for elements for which a > b (signed) static inline Vec16cb operator > (Vec16c const & a, Vec16c const & b) { return _mm_cmpgt_epi8(a,b); } // vector operator < : returns true for elements for which a < b (signed) static inline Vec16cb operator < (Vec16c const & a, Vec16c const & b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (signed) static inline Vec16cb operator >= (Vec16c const & a, Vec16c const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comge_epi8(a,b); #else // SSE2 instruction set return Vec16cb(Vec16c(~(b > a))); #endif } // vector operator <= : returns true for elements for which a <= b (signed) static inline Vec16cb operator <= (Vec16c const & a, Vec16c const & b) { return b >= a; } // vector operator & : bitwise and static inline Vec16c operator & (Vec16c const & a, Vec16c const & b) { return Vec16c(Vec128b(a) & Vec128b(b)); } static inline Vec16c operator && (Vec16c const & a, Vec16c const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec16c & operator &= (Vec16c & a, Vec16c const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec16c operator | (Vec16c 
const & a, Vec16c const & b) { return Vec16c(Vec128b(a) | Vec128b(b)); } static inline Vec16c operator || (Vec16c const & a, Vec16c const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec16c & operator |= (Vec16c & a, Vec16c const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec16c operator ^ (Vec16c const & a, Vec16c const & b) { return Vec16c(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise xor static inline Vec16c & operator ^= (Vec16c & a, Vec16c const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec16c operator ~ (Vec16c const & a) { return Vec16c( ~ Vec128b(a)); } // vector operator ! : logical not, returns true for elements == 0 static inline Vec16cb operator ! (Vec16c const & a) { return _mm_cmpeq_epi8(a,_mm_setzero_si128()); } // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. static inline Vec16c select (Vec16cb const & s, Vec16c const & a, Vec16c const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec16c if_add (Vec16cb const & f, Vec16c const & a, Vec16c const & b) { return a + (Vec16c(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around static inline int32_t horizontal_add (Vec16c const & a) { __m128i sum1 = _mm_sad_epu8(a,_mm_setzero_si128()); __m128i sum2 = _mm_shuffle_epi32(sum1,2); __m128i sum3 = _mm_add_epi16(sum1,sum2); int8_t sum4 = (int8_t)_mm_cvtsi128_si32(sum3); // truncate to 8 bits return sum4; // sign extend to 32 bits } // Horizontal add extended: Calculates the sum of all vector elements. // Each element is sign-extended before addition to avoid overflow static inline int32_t horizontal_add_x (Vec16c const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epi8(a); __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum return _mm_cvtsi128_si32(sum3); #elif INSTRSET >= 4 // SSSE3 __m128i aeven = _mm_slli_epi16(a,8); // even numbered elements of a. get sign bit in position aeven = _mm_srai_epi16(aeven,8); // sign extend even numbered elements __m128i aodd = _mm_srai_epi16(a,8); // sign extend odd numbered elements __m128i sum1 = _mm_add_epi16(aeven,aodd); // add even and odd elements __m128i sum2 = _mm_hadd_epi16(sum1,sum1); // horizontally add 8 elements in 3 steps __m128i sum3 = _mm_hadd_epi16(sum2,sum2); __m128i sum4 = _mm_hadd_epi16(sum3,sum3); int16_t sum5 = (int16_t)_mm_cvtsi128_si32(sum4); // 16 bit sum return sum5; // sign extend to 32 bits #else // SSE2 __m128i aeven = _mm_slli_epi16(a,8); // even numbered elements of a. 
get sign bit in position aeven = _mm_srai_epi16(aeven,8); // sign extend even numbered elements __m128i aodd = _mm_srai_epi16(a,8); // sign extend odd numbered elements __m128i sum1 = _mm_add_epi16(aeven,aodd); // add even and odd elements __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // 4 high elements __m128i sum3 = _mm_add_epi16(sum1,sum2); // 4 sums __m128i sum4 = _mm_shuffle_epi32(sum3,0x01); // 2 high elements __m128i sum5 = _mm_add_epi16(sum3,sum4); // 2 sums __m128i sum6 = _mm_shufflelo_epi16(sum5,0x01); // 1 high element __m128i sum7 = _mm_add_epi16(sum5,sum6); // 1 sum int16_t sum8 = _mm_cvtsi128_si32(sum7); // 16 bit sum return sum8; // sign extend to 32 bits #endif } // function add_saturated: add element by element, signed with saturation static inline Vec16c add_saturated(Vec16c const & a, Vec16c const & b) { return _mm_adds_epi8(a, b); } // function sub_saturated: subtract element by element, signed with saturation static inline Vec16c sub_saturated(Vec16c const & a, Vec16c const & b) { return _mm_subs_epi8(a, b); } // function max: a > b ? a : b static inline Vec16c max(Vec16c const & a, Vec16c const & b) { #if INSTRSET >= 5 // SSE4.1 return _mm_max_epi8(a,b); #else // SSE2 __m128i signbit = _mm_set1_epi32(0x80808080); __m128i a1 = _mm_xor_si128(a,signbit); // add 0x80 __m128i b1 = _mm_xor_si128(b,signbit); // add 0x80 __m128i m1 = _mm_max_epu8(a1,b1); // unsigned max return _mm_xor_si128(m1,signbit); // sub 0x80 #endif } // function min: a < b ? a : b static inline Vec16c min(Vec16c const & a, Vec16c const & b) { #if INSTRSET >= 5 // SSE4.1 return _mm_min_epi8(a,b); #else // SSE2 __m128i signbit = _mm_set1_epi32(0x80808080); __m128i a1 = _mm_xor_si128(a,signbit); // add 0x80 __m128i b1 = _mm_xor_si128(b,signbit); // add 0x80 __m128i m1 = _mm_min_epu8(a1,b1); // unsigned min return _mm_xor_si128(m1,signbit); // sub 0x80 #endif } // function abs: a >= 0 ? 
a : -a static inline Vec16c abs(Vec16c const & a) { #if INSTRSET >= 4 // SSSE3 supported return _mm_sign_epi8(a,a); #else // SSE2 __m128i nega = _mm_sub_epi8(_mm_setzero_si128(), a); return _mm_min_epu8(a, nega); // unsigned min (the negative value is bigger when compared as unsigned) #endif } // function abs_saturated: same as abs, saturate if overflow static inline Vec16c abs_saturated(Vec16c const & a) { __m128i absa = abs(a); // abs(a) __m128i overfl = _mm_cmpgt_epi8(_mm_setzero_si128(),absa);// 0 > a return _mm_add_epi8(absa,overfl); // subtract 1 if 0x80 } // function rotate_left: rotate each element left by b bits // Use negative count to rotate right static inline Vec16c rotate_left(Vec16c const & a, int b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_rot_epi8(a,_mm_set1_epi8(b)); #else // SSE2 instruction set __m128i bb = _mm_cvtsi32_si128(b & 7); // b modulo 8 __m128i mbb = _mm_cvtsi32_si128((8-b) & 7); // 8-b modulo 8 __m128i maskeven = _mm_set1_epi32(0x00FF00FF); // mask for even numbered bytes __m128i even = _mm_and_si128(a,maskeven); // even numbered bytes of a __m128i odd = _mm_andnot_si128(maskeven,a); // odd numbered bytes of a __m128i evenleft = _mm_sll_epi16(even,bb); // even bytes of a << b __m128i oddleft = _mm_sll_epi16(odd,bb); // odd bytes of a << b __m128i evenright = _mm_srl_epi16(even,mbb); // even bytes of a >> 8-b __m128i oddright = _mm_srl_epi16(odd,mbb); // odd bytes of a >> 8-b __m128i evenrot = _mm_or_si128(evenleft,evenright); // even bytes of a rotated __m128i oddrot = _mm_or_si128(oddleft,oddright); // odd bytes of a rotated __m128i allrot = selectb(maskeven,evenrot,oddrot); // all bytes rotated return allrot; #endif } /***************************************************************************** * * Vector of 16 8-bit unsigned integers * *****************************************************************************/ class Vec16uc : public Vec16c { public: // Default constructor: Vec16uc() { }; // Constructor to broadcast the same value into all elements: Vec16uc(uint32_t i) { xmm = _mm_set1_epi8((char)i); }; // Constructor to build from all elements: Vec16uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15) { xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); }; // Constructor to convert from type __m128i used in intrinsics: Vec16uc(__m128i const & x) { xmm = x; }; // Assignment operator to convert from type __m128i used in intrinsics: Vec16uc & operator = (__m128i const & x) { xmm = x; return *this; }; // Member function to load from array (unaligned) Vec16uc & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array (aligned) Vec16uc & load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec16uc const & insert(uint32_t index, uint8_t value) { Vec16c::insert(index, value); return *this; } // Member function extract a single element from vector uint8_t extract(uint32_t index) const { return Vec16c::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
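// Example (illustrative; the values are arbitrary):
//   Vec16uc u(200);
//   uint8_t x = u[3];                    // operator [] reads element 3 (= 200)
//   Vec16uc v = add_saturated(u, u);     // 200+200 saturates to 255 in every element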
uint8_t operator [] (uint32_t index) const { return extract(index); } }; // Define operators for this class // vector operator << : shift left all elements static inline Vec16uc operator << (Vec16uc const & a, uint32_t b) { uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out __m128i am = _mm_and_si128(a,_mm_set1_epi8((char)mask)); // remove bits that will overflow __m128i res = _mm_sll_epi16(am,_mm_cvtsi32_si128(b));// 16-bit shifts return res; } // vector operator << : shift left all elements static inline Vec16uc operator << (Vec16uc const & a, int32_t b) { return a << (uint32_t)b; } // vector operator >> : shift right logical all elements static inline Vec16uc operator >> (Vec16uc const & a, uint32_t b) { uint32_t mask = (uint32_t)0xFF << (uint32_t)b; // mask to remove bits that are shifted out __m128i am = _mm_and_si128(a,_mm_set1_epi8((char)mask)); // remove bits that will overflow __m128i res = _mm_srl_epi16(am,_mm_cvtsi32_si128(b));// 16-bit shifts return res; } // vector operator >> : shift right logical all elements static inline Vec16uc operator >> (Vec16uc const & a, int32_t b) { return a >> (uint32_t)b; } // vector operator >>= : shift right logical static inline Vec16uc & operator >>= (Vec16uc & a, int b) { a = a >> b; return a; } // vector operator >= : returns true for elements for which a >= b (unsigned) static inline Vec16cb operator >= (Vec16uc const & a, Vec16uc const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comge_epu8(a,b); #else // SSE2 instruction set return _mm_cmpeq_epi8(_mm_max_epu8(a,b),a); // a == max(a,b) #endif } // vector operator <= : returns true for elements for which a <= b (unsigned) static inline Vec16cb operator <= (Vec16uc const & a, Vec16uc const & b) { return b >= a; } // vector operator > : returns true for elements for which a > b (unsigned) static inline Vec16cb operator > (Vec16uc const & a, Vec16uc const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comgt_epu8(a,b); #else // SSE2 instruction set return Vec16cb(Vec16c(~(b >= a))); #endif } // vector operator < : returns true for elements for which a < b (unsigned) static inline Vec16cb operator < (Vec16uc const & a, Vec16uc const & b) { return b > a; } // vector operator + : add static inline Vec16uc operator + (Vec16uc const & a, Vec16uc const & b) { return Vec16uc (Vec16c(a) + Vec16c(b)); } // vector operator - : subtract static inline Vec16uc operator - (Vec16uc const & a, Vec16uc const & b) { return Vec16uc (Vec16c(a) - Vec16c(b)); } // vector operator * : multiply static inline Vec16uc operator * (Vec16uc const & a, Vec16uc const & b) { return Vec16uc (Vec16c(a) * Vec16c(b)); } // vector operator & : bitwise and static inline Vec16uc operator & (Vec16uc const & a, Vec16uc const & b) { return Vec16uc(Vec128b(a) & Vec128b(b)); } static inline Vec16uc operator && (Vec16uc const & a, Vec16uc const & b) { return a & b; } // vector operator | : bitwise or static inline Vec16uc operator | (Vec16uc const & a, Vec16uc const & b) { return Vec16uc(Vec128b(a) | Vec128b(b)); } static inline Vec16uc operator || (Vec16uc const & a, Vec16uc const & b) { return a | b; } // vector operator ^ : bitwise xor static inline Vec16uc operator ^ (Vec16uc const & a, Vec16uc const & b) { return Vec16uc(Vec128b(a) ^ Vec128b(b)); } // vector operator ~ : bitwise not static inline Vec16uc operator ~ (Vec16uc const & a) { return Vec16uc( ~ Vec128b(a)); } // Functions for this class // Select between two operands. 
Corresponds to this pseudocode: // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) static inline Vec16uc select (Vec16cb const & s, Vec16uc const & a, Vec16uc const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec16uc if_add (Vec16cb const & f, Vec16uc const & a, Vec16uc const & b) { return a + (Vec16uc(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around // (Note: horizontal_add_x(Vec16uc) is slightly faster) static inline uint32_t horizontal_add (Vec16uc const & a) { __m128i sum1 = _mm_sad_epu8(a,_mm_setzero_si128()); __m128i sum2 = _mm_shuffle_epi32(sum1,2); __m128i sum3 = _mm_add_epi16(sum1,sum2); uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3); // truncate to 16 bits return sum4; } // Horizontal add extended: Calculates the sum of all vector elements. // Each element is zero-extended before addition to avoid overflow static inline uint32_t horizontal_add_x (Vec16uc const & a) { __m128i sum1 = _mm_sad_epu8(a,_mm_setzero_si128()); __m128i sum2 = _mm_shuffle_epi32(sum1,2); __m128i sum3 = _mm_add_epi16(sum1,sum2); return _mm_cvtsi128_si32(sum3); } // function add_saturated: add element by element, unsigned with saturation static inline Vec16uc add_saturated(Vec16uc const & a, Vec16uc const & b) { return _mm_adds_epu8(a, b); } // function sub_saturated: subtract element by element, unsigned with saturation static inline Vec16uc sub_saturated(Vec16uc const & a, Vec16uc const & b) { return _mm_subs_epu8(a, b); } // function max: a > b ? a : b static inline Vec16uc max(Vec16uc const & a, Vec16uc const & b) { return _mm_max_epu8(a,b); } // function min: a < b ? a : b static inline Vec16uc min(Vec16uc const & a, Vec16uc const & b) { return _mm_min_epu8(a,b); } /***************************************************************************** * * Vector of 8 16-bit signed integers * *****************************************************************************/ class Vec8s : public Vec128b { public: // Default constructor: Vec8s() { }; // Constructor to broadcast the same value into all elements: Vec8s(int i) { xmm = _mm_set1_epi16((int16_t)i); }; // Constructor to build from all elements: Vec8s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7) { xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7); }; // Constructor to convert from type __m128i used in intrinsics: Vec8s(__m128i const & x) { xmm = x; }; // Assignment operator to convert from type __m128i used in intrinsics: Vec8s & operator = (__m128i const & x) { xmm = x; return *this; }; // Type cast operator to convert to __m128i used in intrinsics operator __m128i() const { return xmm; }; // Member function to load from array (unaligned) Vec8s & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array (aligned) Vec8s & load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); return *this; } // Partial load. Load n elements and set the rest to 0 Vec8s & load_partial(int n, void const * p) { if (n >= 8) load(p); else if (n <= 0) *this = 0; else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) { // p is at least 16 bytes from a page boundary. OK to read 16 bytes load(p); } else { // worst case. 
read 1 byte at a time and suffer store forwarding penalty int16_t x[8]; for (int i = 0; i < n; i++) x[i] = ((int16_t *)p)[i]; load(x); } cutoff(n); return *this; } // Partial store. Store n elements void store_partial(int n, void * p) const { if (n >= 8) { store(p); return; } if (n <= 0) return; // we are not using _mm_maskmoveu_si128 because it is too slow on many processors union { int8_t c[16]; int16_t s[8]; int32_t i[4]; int64_t q[2]; } u; store(u.c); int j = 0; if (n & 4) { *(int64_t*)p = u.q[0]; j += 8; } if (n & 2) { ((int32_t*)p)[j/4] = u.i[j/4]; j += 4; } if (n & 1) { ((int16_t*)p)[j/2] = u.s[j/2]; } } // cut off vector to n elements. The last 8-n elements are set to zero Vec8s & cutoff(int n) { *this = Vec16c(xmm).cutoff(n * 2); return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec8s const & insert(uint32_t index, int16_t value) { switch(index) { case 0: xmm = _mm_insert_epi16(xmm,value,0); break; case 1: xmm = _mm_insert_epi16(xmm,value,1); break; case 2: xmm = _mm_insert_epi16(xmm,value,2); break; case 3: xmm = _mm_insert_epi16(xmm,value,3); break; case 4: xmm = _mm_insert_epi16(xmm,value,4); break; case 5: xmm = _mm_insert_epi16(xmm,value,5); break; case 6: xmm = _mm_insert_epi16(xmm,value,6); break; case 7: xmm = _mm_insert_epi16(xmm,value,7); break; } return *this; }; // Member function extract a single element from vector // Note: This function is inefficient. Use store function if extracting more than one element int16_t extract(uint32_t index) const { switch(index) { case 0: return (int16_t)_mm_extract_epi16(xmm,0); case 1: return (int16_t)_mm_extract_epi16(xmm,1); case 2: return (int16_t)_mm_extract_epi16(xmm,2); case 3: return (int16_t)_mm_extract_epi16(xmm,3); case 4: return (int16_t)_mm_extract_epi16(xmm,4); case 5: return (int16_t)_mm_extract_epi16(xmm,5); case 6: return (int16_t)_mm_extract_epi16(xmm,6); case 7: return (int16_t)_mm_extract_epi16(xmm,7); } return 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. int16_t operator [] (uint32_t index) const { return extract(index); } static int size() { return 8; } }; /***************************************************************************** * * Vec8sb: Vector of 8 Booleans for use with Vec8s and Vec8us * *****************************************************************************/ class Vec8sb : public Vec8s { public: // Constructor to build from all elements: Vec8sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) { xmm = Vec8s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7)); } // Default constructor: Vec8sb() { } // Constructor to convert from type __m128i used in intrinsics: Vec8sb(__m128i const & x) { xmm = x; } // Assignment operator to convert from type __m128i used in intrinsics: Vec8sb & operator = (__m128i const & x) { xmm = x; return *this; } // Constructor to broadcast scalar value: Vec8sb(bool b) : Vec8s(-int16_t(b)) { } // Assignment operator to broadcast scalar value: Vec8sb & operator = (bool b) { *this = Vec8sb(b); return *this; } private: // Prevent constructing from int, etc. Vec8sb(int b); Vec8sb & operator = (int x); public: Vec8sb & insert (int index, bool a) { Vec8s::insert(index, -(int)a); return *this; } // Member function extract a single element from vector // Note: This function is inefficient. 
Use store function if extracting more than one element bool extract(uint32_t index) const { return Vec8s::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. bool operator [] (uint32_t index) const { return extract(index); } }; /***************************************************************************** * * Define operators for Vec8sb * *****************************************************************************/ // vector operator & : bitwise and static inline Vec8sb operator & (Vec8sb const & a, Vec8sb const & b) { return Vec8sb(Vec128b(a) & Vec128b(b)); } static inline Vec8sb operator && (Vec8sb const & a, Vec8sb const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec8sb & operator &= (Vec8sb & a, Vec8sb const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec8sb operator | (Vec8sb const & a, Vec8sb const & b) { return Vec8sb(Vec128b(a) | Vec128b(b)); } static inline Vec8sb operator || (Vec8sb const & a, Vec8sb const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec8sb & operator |= (Vec8sb & a, Vec8sb const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec8sb operator ^ (Vec8sb const & a, Vec8sb const & b) { return Vec8sb(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise xor static inline Vec8sb & operator ^= (Vec8sb & a, Vec8sb const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec8sb operator ~ (Vec8sb const & a) { return Vec8sb( ~ Vec128b(a)); } // vector operator ! : element not static inline Vec8sb operator ! (Vec8sb const & a) { return ~ a; } // vector function andnot static inline Vec8sb andnot (Vec8sb const & a, Vec8sb const & b) { return Vec8sb(andnot(Vec128b(a), Vec128b(b))); } /***************************************************************************** * * operators for Vec8s * *****************************************************************************/ // vector operator + : add element by element static inline Vec8s operator + (Vec8s const & a, Vec8s const & b) { return _mm_add_epi16(a, b); } // vector operator += : add static inline Vec8s & operator += (Vec8s & a, Vec8s const & b) { a = a + b; return a; } // postfix operator ++ static inline Vec8s operator ++ (Vec8s & a, int) { Vec8s a0 = a; a = a + 1; return a0; } // prefix operator ++ static inline Vec8s & operator ++ (Vec8s & a) { a = a + 1; return a; } // vector operator - : subtract element by element static inline Vec8s operator - (Vec8s const & a, Vec8s const & b) { return _mm_sub_epi16(a, b); } // vector operator - : unary minus static inline Vec8s operator - (Vec8s const & a) { return _mm_sub_epi16(_mm_setzero_si128(), a); } // vector operator -= : subtract static inline Vec8s & operator -= (Vec8s & a, Vec8s const & b) { a = a - b; return a; } // postfix operator -- static inline Vec8s operator -- (Vec8s & a, int) { Vec8s a0 = a; a = a - 1; return a0; } // prefix operator -- static inline Vec8s & operator -- (Vec8s & a) { a = a - 1; return a; } // vector operator * : multiply element by element static inline Vec8s operator * (Vec8s const & a, Vec8s const & b) { return _mm_mullo_epi16(a, b); } // vector operator *= : multiply static inline Vec8s & operator *= (Vec8s & a, Vec8s const & b) { a = a * b; return a; } // vector operator / : divide all elements by same integer // See bottom of file // vector operator << : shift left static inline 
Vec8s operator << (Vec8s const & a, int b) { return _mm_sll_epi16(a,_mm_cvtsi32_si128(b)); } // vector operator <<= : shift left static inline Vec8s & operator <<= (Vec8s & a, int b) { a = a << b; return a; } // vector operator >> : shift right arithmetic static inline Vec8s operator >> (Vec8s const & a, int b) { return _mm_sra_epi16(a,_mm_cvtsi32_si128(b)); } // vector operator >>= : shift right arithmetic static inline Vec8s & operator >>= (Vec8s & a, int b) { a = a >> b; return a; } // vector operator == : returns true for elements for which a == b static inline Vec8s operator == (Vec8s const & a, Vec8s const & b) { return _mm_cmpeq_epi16(a, b); } // vector operator != : returns true for elements for which a != b static inline Vec8s operator != (Vec8s const & a, Vec8s const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comneq_epi16(a,b); #else // SSE2 instruction set return Vec8s (~(a == b)); #endif } // vector operator > : returns true for elements for which a > b static inline Vec8s operator > (Vec8s const & a, Vec8s const & b) { return _mm_cmpgt_epi16(a, b); } // vector operator < : returns true for elements for which a < b static inline Vec8s operator < (Vec8s const & a, Vec8s const & b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (signed) static inline Vec8s operator >= (Vec8s const & a, Vec8s const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comge_epi16(a,b); #else // SSE2 instruction set return Vec8s (~(b > a)); #endif } // vector operator <= : returns true for elements for which a <= b (signed) static inline Vec8s operator <= (Vec8s const & a, Vec8s const & b) { return b >= a; } // vector operator & : bitwise and static inline Vec8s operator & (Vec8s const & a, Vec8s const & b) { return Vec8s(Vec128b(a) & Vec128b(b)); } static inline Vec8s operator && (Vec8s const & a, Vec8s const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec8s & operator &= (Vec8s & a, Vec8s const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec8s operator | (Vec8s const & a, Vec8s const & b) { return Vec8s(Vec128b(a) | Vec128b(b)); } static inline Vec8s operator || (Vec8s const & a, Vec8s const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec8s & operator |= (Vec8s & a, Vec8s const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec8s operator ^ (Vec8s const & a, Vec8s const & b) { return Vec8s(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise xor static inline Vec8s & operator ^= (Vec8s & a, Vec8s const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec8s operator ~ (Vec8s const & a) { return Vec8s( ~ Vec128b(a)); } // vector operator ! : logical not, returns true for elements == 0 static inline Vec8s operator ! (Vec8s const & a) { return _mm_cmpeq_epi16(a,_mm_setzero_si128()); } // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) static inline Vec8s select (Vec8s const & s, Vec8s const & a, Vec8s const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] static inline Vec8s if_add (Vec8sb const & f, Vec8s const & a, Vec8s const & b) { return a + (Vec8s(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around static inline int32_t horizontal_add (Vec8s const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epi16(a); __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum int16_t sum4 = _mm_cvtsi128_si32(sum3); // truncate to 16 bits return sum4; // sign extend to 32 bits #elif INSTRSET >= 4 // SSSE3 __m128i sum1 = _mm_hadd_epi16(a,a); // horizontally add 8 elements in 3 steps __m128i sum2 = _mm_hadd_epi16(sum1,sum1); __m128i sum3 = _mm_hadd_epi16(sum2,sum2); int16_t sum4 = (int16_t)_mm_cvtsi128_si32(sum3); // 16 bit sum return sum4; // sign extend to 32 bits #else // SSE2 __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // 4 high elements __m128i sum2 = _mm_add_epi16(a,sum1); // 4 sums __m128i sum3 = _mm_shuffle_epi32(sum2,0x01); // 2 high elements __m128i sum4 = _mm_add_epi16(sum2,sum3); // 2 sums __m128i sum5 = _mm_shufflelo_epi16(sum4,0x01); // 1 high element __m128i sum6 = _mm_add_epi16(sum4,sum5); // 1 sum int16_t sum7 = _mm_cvtsi128_si32(sum6); // 16 bit sum return sum7; // sign extend to 32 bits #endif } // Horizontal add extended: Calculates the sum of all vector elements. // Elements are sign extended before adding to avoid overflow static inline int32_t horizontal_add_x (Vec8s const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epi16(a); __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum return _mm_cvtsi128_si32(sum3); #elif INSTRSET >= 4 // SSSE3 __m128i aeven = _mm_slli_epi32(a,16); // even numbered elements of a. get sign bit in position aeven = _mm_srai_epi32(aeven,16); // sign extend even numbered elements __m128i aodd = _mm_srai_epi32(a,16); // sign extend odd numbered elements __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements __m128i sum2 = _mm_hadd_epi32(sum1,sum1); // horizontally add 4 elements in 2 steps __m128i sum3 = _mm_hadd_epi32(sum2,sum2); return _mm_cvtsi128_si32(sum3); #else // SSE2 __m128i aeven = _mm_slli_epi32(a,16); // even numbered elements of a. get sign bit in position aeven = _mm_srai_epi32(aeven,16); // sign extend even numbered elements __m128i aodd = _mm_srai_epi32(a,16); // sign extend odd numbered elements __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // 2 high elements __m128i sum3 = _mm_add_epi32(sum1,sum2); __m128i sum4 = _mm_shuffle_epi32(sum3,0x01); // 1 high elements __m128i sum5 = _mm_add_epi32(sum3,sum4); return _mm_cvtsi128_si32(sum5); // 32 bit sum #endif } // function add_saturated: add element by element, signed with saturation static inline Vec8s add_saturated(Vec8s const & a, Vec8s const & b) { return _mm_adds_epi16(a, b); } // function sub_saturated: subtract element by element, signed with saturation static inline Vec8s sub_saturated(Vec8s const & a, Vec8s const & b) { return _mm_subs_epi16(a, b); } // function max: a > b ? a : b static inline Vec8s max(Vec8s const & a, Vec8s const & b) { return _mm_max_epi16(a,b); } // function min: a < b ? a : b static inline Vec8s min(Vec8s const & a, Vec8s const & b) { return _mm_min_epi16(a,b); } // function abs: a >= 0 ? 
a : -a static inline Vec8s abs(Vec8s const & a) { #if INSTRSET >= 4 // SSSE3 supported return _mm_sign_epi16(a,a); #else // SSE2 __m128i nega = _mm_sub_epi16(_mm_setzero_si128(), a); return _mm_max_epi16(a, nega); #endif } // function abs_saturated: same as abs, saturate if overflow static inline Vec8s abs_saturated(Vec8s const & a) { __m128i absa = abs(a); // abs(a) __m128i overfl = _mm_srai_epi16(absa,15); // sign return _mm_add_epi16(absa,overfl); // subtract 1 if 0x8000 } // function rotate_left all elements // Use negative count to rotate right static inline Vec8s rotate_left(Vec8s const & a, int b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_rot_epi16(a,_mm_set1_epi16(b)); #else // SSE2 instruction set __m128i left = _mm_sll_epi16(a,_mm_cvtsi32_si128(b & 0x0F)); // a << b __m128i right = _mm_srl_epi16(a,_mm_cvtsi32_si128((16-b) & 0x0F)); // a >> (16 - b) __m128i rot = _mm_or_si128(left,right); // or return rot; #endif } /***************************************************************************** * * Vector of 8 16-bit unsigned integers * *****************************************************************************/ class Vec8us : public Vec8s { public: // Default constructor: Vec8us() { }; // Constructor to broadcast the same value into all elements: Vec8us(uint32_t i) { xmm = _mm_set1_epi16((int16_t)i); }; // Constructor to build from all elements: Vec8us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7) { xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7); }; // Constructor to convert from type __m128i used in intrinsics: Vec8us(__m128i const & x) { xmm = x; }; // Assignment operator to convert from type __m128i used in intrinsics: Vec8us & operator = (__m128i const & x) { xmm = x; return *this; }; // Member function to load from array (unaligned) Vec8us & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array (aligned) Vec8us & load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec8us const & insert(uint32_t index, uint16_t value) { Vec8s::insert(index, value); return *this; }; // Member function extract a single element from vector uint16_t extract(uint32_t index) const { return Vec8s::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
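// Example (illustrative; the values are arbitrary):
//   Vec8us   a(40000), b(30000);
//   uint16_t e = a[0];      // operator [] reads element 0 (= 40000)
//   Vec8s    m = a >= b;    // unsigned compare; every element is true (-1)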
uint16_t operator [] (uint32_t index) const { return extract(index); } }; // Define operators for this class // vector operator + : add static inline Vec8us operator + (Vec8us const & a, Vec8us const & b) { return Vec8us (Vec8s(a) + Vec8s(b)); } // vector operator - : subtract static inline Vec8us operator - (Vec8us const & a, Vec8us const & b) { return Vec8us (Vec8s(a) - Vec8s(b)); } // vector operator * : multiply static inline Vec8us operator * (Vec8us const & a, Vec8us const & b) { return Vec8us (Vec8s(a) * Vec8s(b)); } // vector operator / : divide // See bottom of file // vector operator >> : shift right logical all elements static inline Vec8us operator >> (Vec8us const & a, uint32_t b) { return _mm_srl_epi16(a,_mm_cvtsi32_si128(b)); } // vector operator >> : shift right logical all elements static inline Vec8us operator >> (Vec8us const & a, int32_t b) { return a >> (uint32_t)b; } // vector operator >>= : shift right logical static inline Vec8us & operator >>= (Vec8us & a, int b) { a = a >> b; return a; } // vector operator << : shift left all elements static inline Vec8us operator << (Vec8us const & a, uint32_t b) { return _mm_sll_epi16(a,_mm_cvtsi32_si128(b)); } // vector operator << : shift left all elements static inline Vec8us operator << (Vec8us const & a, int32_t b) { return a << (uint32_t)b; } // vector operator >= : returns true for elements for which a >= b (unsigned) static inline Vec8s operator >= (Vec8us const & a, Vec8us const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comge_epu16(a,b); #elif INSTRSET >= 5 // SSE4.1 __m128i max_ab = _mm_max_epu16(a,b); // max(a,b), unsigned return _mm_cmpeq_epi16(a,max_ab); // a == max(a,b) #else // SSE2 instruction set __m128i sub1 = _mm_sub_epi16(a,b); // a-b, wraparound __m128i sub2 = _mm_subs_epu16(a,b); // a-b, saturated return _mm_cmpeq_epi16(sub1,sub2); // sub1 == sub2 if no carry #endif } // vector operator <= : returns true for elements for which a <= b (unsigned) static inline Vec8s operator <= (Vec8us const & a, Vec8us const & b) { return b >= a; } // vector operator > : returns true for elements for which a > b (unsigned) static inline Vec8s operator > (Vec8us const & a, Vec8us const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comgt_epu16(a,b); #else // SSE2 instruction set return Vec8s (~(b >= a)); #endif } // vector operator < : returns true for elements for which a < b (unsigned) static inline Vec8s operator < (Vec8us const & a, Vec8us const & b) { return b > a; } // vector operator & : bitwise and static inline Vec8us operator & (Vec8us const & a, Vec8us const & b) { return Vec8us(Vec128b(a) & Vec128b(b)); } static inline Vec8us operator && (Vec8us const & a, Vec8us const & b) { return a & b; } // vector operator | : bitwise or static inline Vec8us operator | (Vec8us const & a, Vec8us const & b) { return Vec8us(Vec128b(a) | Vec128b(b)); } static inline Vec8us operator || (Vec8us const & a, Vec8us const & b) { return a | b; } // vector operator ^ : bitwise xor static inline Vec8us operator ^ (Vec8us const & a, Vec8us const & b) { return Vec8us(Vec128b(a) ^ Vec128b(b)); } // vector operator ~ : bitwise not static inline Vec8us operator ~ (Vec8us const & a) { return Vec8us( ~ Vec128b(a)); } // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each word in s must be either 0 (false) or -1 (true). No other values are allowed. 
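// For example (illustrative, with hypothetical Vec8us operands a and b), an
// unsigned element-wise minimum could be written as
//   Vec8us lo = select(a < b, a, b);     // same result as min(a,b) below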
// (s is signed) static inline Vec8us select (Vec8s const & s, Vec8us const & a, Vec8us const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec8us if_add (Vec8sb const & f, Vec8us const & a, Vec8us const & b) { return a + (Vec8us(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around static inline uint32_t horizontal_add (Vec8us const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epu16(a); __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum uint16_t sum4 = _mm_cvtsi128_si32(sum3); // truncate to 16 bits return sum4; // zero extend to 32 bits #elif INSTRSET >= 4 // SSSE3 __m128i sum1 = _mm_hadd_epi16(a,a); // horizontally add 8 elements in 3 steps __m128i sum2 = _mm_hadd_epi16(sum1,sum1); __m128i sum3 = _mm_hadd_epi16(sum2,sum2); uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3); // 16 bit sum return sum4; // zero extend to 32 bits #else // SSE2 __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // 4 high elements __m128i sum2 = _mm_add_epi16(a,sum1); // 4 sums __m128i sum3 = _mm_shuffle_epi32(sum2,0x01); // 2 high elements __m128i sum4 = _mm_add_epi16(sum2,sum3); // 2 sums __m128i sum5 = _mm_shufflelo_epi16(sum4,0x01); // 1 high element __m128i sum6 = _mm_add_epi16(sum4,sum5); // 1 sum uint16_t sum7 = _mm_cvtsi128_si32(sum6); // 16 bit sum return sum7; // zero extend to 32 bits #endif } // Horizontal add extended: Calculates the sum of all vector elements. // Each element is zero-extended before addition to avoid overflow static inline uint32_t horizontal_add_x (Vec8us const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epu16(a); __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum return _mm_cvtsi128_si32(sum3); #elif INSTRSET >= 4 // SSSE3 __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for even positions __m128i aeven = _mm_and_si128(a,mask); // even numbered elements of a __m128i aodd = _mm_srli_epi32(a,16); // zero extend odd numbered elements __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements __m128i sum2 = _mm_hadd_epi32(sum1,sum1); // horizontally add 4 elements in 2 steps __m128i sum3 = _mm_hadd_epi32(sum2,sum2); return _mm_cvtsi128_si32(sum3); #else // SSE2 __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for even positions __m128i aeven = _mm_and_si128(a,mask); // even numbered elements of a __m128i aodd = _mm_srli_epi32(a,16); // zero extend odd numbered elements __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // 2 high elements __m128i sum3 = _mm_add_epi32(sum1,sum2); __m128i sum4 = _mm_shuffle_epi32(sum3,0x01); // 1 high elements __m128i sum5 = _mm_add_epi32(sum3,sum4); return _mm_cvtsi128_si32(sum5); // 16 bit sum #endif } // function add_saturated: add element by element, unsigned with saturation static inline Vec8us add_saturated(Vec8us const & a, Vec8us const & b) { return _mm_adds_epu16(a, b); } // function sub_saturated: subtract element by element, unsigned with saturation static inline Vec8us sub_saturated(Vec8us const & a, Vec8us const & b) { return _mm_subs_epu16(a, b); } // function max: a > b ? 
a : b static inline Vec8us max(Vec8us const & a, Vec8us const & b) { #if INSTRSET >= 5 // SSE4.1 return _mm_max_epu16(a,b); #else // SSE2 __m128i signbit = _mm_set1_epi32(0x80008000); __m128i a1 = _mm_xor_si128(a,signbit); // add 0x8000 __m128i b1 = _mm_xor_si128(b,signbit); // add 0x8000 __m128i m1 = _mm_max_epi16(a1,b1); // signed max return _mm_xor_si128(m1,signbit); // sub 0x8000 #endif } // function min: a < b ? a : b static inline Vec8us min(Vec8us const & a, Vec8us const & b) { #if INSTRSET >= 5 // SSE4.1 return _mm_min_epu16(a,b); #else // SSE2 __m128i signbit = _mm_set1_epi32(0x80008000); __m128i a1 = _mm_xor_si128(a,signbit); // add 0x8000 __m128i b1 = _mm_xor_si128(b,signbit); // add 0x8000 __m128i m1 = _mm_min_epi16(a1,b1); // signed min return _mm_xor_si128(m1,signbit); // sub 0x8000 #endif } /***************************************************************************** * * Vector of 4 32-bit signed integers * *****************************************************************************/ class Vec4i : public Vec128b { public: // Default constructor: Vec4i() { } // Constructor to broadcast the same value into all elements: Vec4i(int i) { xmm = _mm_set1_epi32(i); } // Constructor to build from all elements: Vec4i(int32_t i0, int32_t i1, int32_t i2, int32_t i3) { xmm = _mm_setr_epi32(i0, i1, i2, i3); } // Constructor to convert from type __m128i used in intrinsics: Vec4i(__m128i const & x) { xmm = x; } // Assignment operator to convert from type __m128i used in intrinsics: Vec4i & operator = (__m128i const & x) { xmm = x; return *this; } // Type cast operator to convert to __m128i used in intrinsics operator __m128i() const { return xmm; } // Member function to load from array (unaligned) Vec4i & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array (aligned) Vec4i & load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); return *this; } // Partial load. Load n elements and set the rest to 0 Vec4i & load_partial(int n, void const * p) { switch (n) { case 0: *this = 0; break; case 1: xmm = _mm_cvtsi32_si128(*(int32_t*)p); break; case 2: // intrinsic for movq is missing! xmm = _mm_setr_epi32(((int32_t*)p)[0], ((int32_t*)p)[1], 0, 0); break; case 3: xmm = _mm_setr_epi32(((int32_t*)p)[0], ((int32_t*)p)[1], ((int32_t*)p)[2], 0); break; case 4: load(p); break; default: break; } return *this; } // Partial store. Store n elements void store_partial(int n, void * p) const { union { int32_t i[4]; int64_t q[2]; } u; switch (n) { case 1: *(int32_t*)p = _mm_cvtsi128_si32(xmm); break; case 2: // intrinsic for movq is missing! store(u.i); *(int64_t*)p = u.q[0]; break; case 3: store(u.i); *(int64_t*)p = u.q[0]; ((int32_t*)p)[2] = u.i[2]; break; case 4: store(p); break; default: break; } } // cut off vector to n elements. The last 4-n elements are set to zero Vec4i & cutoff(int n) { *this = Vec16c(xmm).cutoff(n * 4); return *this; } // Member function to change a single element in vector // Note: This function is inefficient. 
Use load function if changing more than one element Vec4i const & insert(uint32_t index, int32_t value) { static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0}; __m128i broad = _mm_set1_epi32(value); // broadcast value into all elements __m128i mask = _mm_loadu_si128((__m128i const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position xmm = selectb(mask,broad,xmm); return *this; } // Member function extract a single element from vector int32_t extract(uint32_t index) const { int32_t x[4]; store(x); return x[index & 3]; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. int32_t operator [] (uint32_t index) const { return extract(index); } static int size() { return 4; } }; /***************************************************************************** * * Vec4ib: Vector of 4 Booleans for use with Vec4i and Vec4ui * *****************************************************************************/ class Vec4ib : public Vec4i { public: // Default constructor: Vec4ib() { } // Constructor to build from all elements: Vec4ib(bool x0, bool x1, bool x2, bool x3) { xmm = Vec4i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3)); } // Constructor to convert from type __m128i used in intrinsics: Vec4ib(__m128i const & x) { xmm = x; } // Assignment operator to convert from type __m128i used in intrinsics: Vec4ib & operator = (__m128i const & x) { xmm = x; return *this; } // Constructor to broadcast scalar value: Vec4ib(bool b) : Vec4i(-int32_t(b)) { } // Assignment operator to broadcast scalar value: Vec4ib & operator = (bool b) { *this = Vec4ib(b); return *this; } private: // Prevent constructing from int, etc. Vec4ib(int b); Vec4ib & operator = (int x); public: Vec4ib & insert (int index, bool a) { Vec4i::insert(index, -(int)a); return *this; } // Member function extract a single element from vector bool extract(uint32_t index) const { return Vec4i::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. bool operator [] (uint32_t index) const { return extract(index); } }; /***************************************************************************** * * Define operators for Vec4ib * *****************************************************************************/ // vector operator & : bitwise and static inline Vec4ib operator & (Vec4ib const & a, Vec4ib const & b) { return Vec4ib(Vec128b(a) & Vec128b(b)); } static inline Vec4ib operator && (Vec4ib const & a, Vec4ib const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec4ib & operator &= (Vec4ib & a, Vec4ib const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec4ib operator | (Vec4ib const & a, Vec4ib const & b) { return Vec4ib(Vec128b(a) | Vec128b(b)); } static inline Vec4ib operator || (Vec4ib const & a, Vec4ib const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec4ib & operator |= (Vec4ib & a, Vec4ib const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec4ib operator ^ (Vec4ib const & a, Vec4ib const & b) { return Vec4ib(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise xor static inline Vec4ib & operator ^= (Vec4ib & a, Vec4ib const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec4ib operator ~ (Vec4ib const & a) { return Vec4ib( ~ Vec128b(a)); } // vector operator ! 
: element not static inline Vec4ib operator ! (Vec4ib const & a) { return ~ a; } // vector function andnot static inline Vec4ib andnot (Vec4ib const & a, Vec4ib const & b) { return Vec4ib(andnot(Vec128b(a), Vec128b(b))); } /***************************************************************************** * * Operators for Vec4i * *****************************************************************************/ // vector operator + : add element by element static inline Vec4i operator + (Vec4i const & a, Vec4i const & b) { return _mm_add_epi32(a, b); } // vector operator += : add static inline Vec4i & operator += (Vec4i & a, Vec4i const & b) { a = a + b; return a; } // postfix operator ++ static inline Vec4i operator ++ (Vec4i & a, int) { Vec4i a0 = a; a = a + 1; return a0; } // prefix operator ++ static inline Vec4i & operator ++ (Vec4i & a) { a = a + 1; return a; } // vector operator - : subtract element by element static inline Vec4i operator - (Vec4i const & a, Vec4i const & b) { return _mm_sub_epi32(a, b); } // vector operator - : unary minus static inline Vec4i operator - (Vec4i const & a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } // vector operator -= : subtract static inline Vec4i & operator -= (Vec4i & a, Vec4i const & b) { a = a - b; return a; } // postfix operator -- static inline Vec4i operator -- (Vec4i & a, int) { Vec4i a0 = a; a = a - 1; return a0; } // prefix operator -- static inline Vec4i & operator -- (Vec4i & a) { a = a - 1; return a; } // vector operator * : multiply element by element static inline Vec4i operator * (Vec4i const & a, Vec4i const & b) { #if INSTRSET >= 5 // SSE4.1 instruction set return _mm_mullo_epi32(a, b); #else __m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1) __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1) __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0) __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) return _mm_unpacklo_epi64(prod01,prod23); // (ab3,ab2,ab1,ab0) #endif } // vector operator *= : multiply static inline Vec4i & operator *= (Vec4i & a, Vec4i const & b) { a = a * b; return a; } // vector operator / : divide all elements by same integer // See bottom of file // vector operator << : shift left static inline Vec4i operator << (Vec4i const & a, int32_t b) { return _mm_sll_epi32(a,_mm_cvtsi32_si128(b)); } // vector operator <<= : shift left static inline Vec4i & operator <<= (Vec4i & a, int32_t b) { a = a << b; return a; } // vector operator >> : shift right arithmetic static inline Vec4i operator >> (Vec4i const & a, int32_t b) { return _mm_sra_epi32(a,_mm_cvtsi32_si128(b)); } // vector operator >>= : shift right arithmetic static inline Vec4i & operator >>= (Vec4i & a, int32_t b) { a = a >> b; return a; } // vector operator == : returns true for elements for which a == b static inline Vec4ib operator == (Vec4i const & a, Vec4i const & b) { return _mm_cmpeq_epi32(a, b); } // vector operator != : returns true for elements for which a != b static inline Vec4ib operator != (Vec4i const & a, Vec4i const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comneq_epi32(a,b); #else // SSE2 instruction set return Vec4ib(Vec4i (~(a == b))); #endif } // vector operator > : returns true for elements for which a > b static inline Vec4ib operator > (Vec4i const & a, Vec4i const & b) { return _mm_cmpgt_epi32(a, b); } // vector operator < : returns true for 
elements for which a < b static inline Vec4ib operator < (Vec4i const & a, Vec4i const & b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (signed) static inline Vec4ib operator >= (Vec4i const & a, Vec4i const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comge_epi32(a,b); #else // SSE2 instruction set return Vec4ib(Vec4i (~(b > a))); #endif } // vector operator <= : returns true for elements for which a <= b (signed) static inline Vec4ib operator <= (Vec4i const & a, Vec4i const & b) { return b >= a; } // vector operator & : bitwise and static inline Vec4i operator & (Vec4i const & a, Vec4i const & b) { return Vec4i(Vec128b(a) & Vec128b(b)); } static inline Vec4i operator && (Vec4i const & a, Vec4i const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec4i & operator &= (Vec4i & a, Vec4i const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec4i operator | (Vec4i const & a, Vec4i const & b) { return Vec4i(Vec128b(a) | Vec128b(b)); } static inline Vec4i operator || (Vec4i const & a, Vec4i const & b) { return a | b; } // vector operator |= : bitwise and static inline Vec4i & operator |= (Vec4i & a, Vec4i const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec4i operator ^ (Vec4i const & a, Vec4i const & b) { return Vec4i(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise and static inline Vec4i & operator ^= (Vec4i & a, Vec4i const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec4i operator ~ (Vec4i const & a) { return Vec4i( ~ Vec128b(a)); } // vector operator ! : returns true for elements == 0 static inline Vec4ib operator ! (Vec4i const & a) { return _mm_cmpeq_epi32(a,_mm_setzero_si128()); } // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) static inline Vec4i select (Vec4ib const & s, Vec4i const & a, Vec4i const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec4i if_add (Vec4ib const & f, Vec4i const & a, Vec4i const & b) { return a + (Vec4i(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around static inline int32_t horizontal_add (Vec4i const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epi32(a); __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum return _mm_cvtsi128_si32(sum3); // truncate to 32 bits #elif INSTRSET >= 4 // SSSE3 __m128i sum1 = _mm_hadd_epi32(a,a); // horizontally add 4 elements in 2 steps __m128i sum2 = _mm_hadd_epi32(sum1,sum1); return _mm_cvtsi128_si32(sum2); // 32 bit sum #else // SSE2 __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // 2 high elements __m128i sum2 = _mm_add_epi32(a,sum1); // 2 sums __m128i sum3 = _mm_shuffle_epi32(sum2,0x01); // 1 high element __m128i sum4 = _mm_add_epi32(sum2,sum3); // 2 sums return _mm_cvtsi128_si32(sum4); // 32 bit sum #endif } // Horizontal add extended: Calculates the sum of all vector elements. 
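// Illustrative sketch (added; values chosen arbitrarily): horizontal_add wraps on overflow,
// while horizontal_add_x below does not:
//     Vec4i v(1, 2, 3, 4);
//     int32_t s  = horizontal_add(v);                            // s = 10
//     int32_t w  = horizontal_add(Vec4i(0x7FFFFFFF, 1, 0, 0));   // wraps around to -2147483648
//     int64_t sx = horizontal_add_x(Vec4i(0x7FFFFFFF, 1, 0, 0)); // sx = 2147483648, no wraparound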
// Elements are sign extended before adding to avoid overflow static inline int64_t horizontal_add_x (Vec4i const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epi32(a); #else // SSE2 __m128i signs = _mm_srai_epi32(a,31); // sign of all elements __m128i a01 = _mm_unpacklo_epi32(a,signs); // sign-extended a0, a1 __m128i a23 = _mm_unpackhi_epi32(a,signs); // sign-extended a2, a3 __m128i sum1 = _mm_add_epi64(a01,a23); // add #endif __m128i sum2 = _mm_unpackhi_epi64(sum1,sum1); // high qword __m128i sum3 = _mm_add_epi64(sum1,sum2); // add #if defined (__x86_64__) return _mm_cvtsi128_si64(sum3); // 64 bit mode #else union { __m128i x; // silly definition of _mm_storel_epi64 requires __m128i int64_t i; } u; _mm_storel_epi64(&u.x,sum3); return u.i; #endif } // function add_saturated: add element by element, signed with saturation static inline Vec4i add_saturated(Vec4i const & a, Vec4i const & b) { __m128i sum = _mm_add_epi32(a, b); // a + b __m128i axb = _mm_xor_si128(a, b); // check if a and b have different sign __m128i axs = _mm_xor_si128(a, sum); // check if a and sum have different sign __m128i overf1 = _mm_andnot_si128(axb,axs); // check if sum has wrong sign __m128i overf2 = _mm_srai_epi32(overf1,31); // -1 if overflow __m128i asign = _mm_srli_epi32(a,31); // 1 if a < 0 __m128i sat1 = _mm_srli_epi32(overf2,1); // 7FFFFFFF if overflow __m128i sat2 = _mm_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow return selectb(overf2,sat2,sum); // sum if not overflow, else sat2 } // function sub_saturated: subtract element by element, signed with saturation static inline Vec4i sub_saturated(Vec4i const & a, Vec4i const & b) { __m128i diff = _mm_sub_epi32(a, b); // a + b __m128i axb = _mm_xor_si128(a, b); // check if a and b have different sign __m128i axs = _mm_xor_si128(a, diff); // check if a and sum have different sign __m128i overf1 = _mm_and_si128(axb,axs); // check if sum has wrong sign __m128i overf2 = _mm_srai_epi32(overf1,31); // -1 if overflow __m128i asign = _mm_srli_epi32(a,31); // 1 if a < 0 __m128i sat1 = _mm_srli_epi32(overf2,1); // 7FFFFFFF if overflow __m128i sat2 = _mm_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow return selectb(overf2,sat2,diff); // diff if not overflow, else sat2 } // function max: a > b ? a : b static inline Vec4i max(Vec4i const & a, Vec4i const & b) { #if INSTRSET >= 5 // SSE4.1 supported return _mm_max_epi32(a,b); #else __m128i greater = _mm_cmpgt_epi32(a,b); return selectb(greater,a,b); #endif } // function min: a < b ? a : b static inline Vec4i min(Vec4i const & a, Vec4i const & b) { #if INSTRSET >= 5 // SSE4.1 supported return _mm_min_epi32(a,b); #else __m128i greater = _mm_cmpgt_epi32(a,b); return selectb(greater,b,a); #endif } // function abs: a >= 0 ? 
a : -a static inline Vec4i abs(Vec4i const & a) { #if INSTRSET >= 4 // SSSE3 supported return _mm_sign_epi32(a,a); #else // SSE2 __m128i sign = _mm_srai_epi32(a,31); // sign of a __m128i inv = _mm_xor_si128(a,sign); // invert bits if negative return _mm_sub_epi32(inv,sign); // add 1 #endif } // function abs_saturated: same as abs, saturate if overflow static inline Vec4i abs_saturated(Vec4i const & a) { __m128i absa = abs(a); // abs(a) __m128i overfl = _mm_srai_epi32(absa,31); // sign return _mm_add_epi32(absa,overfl); // subtract 1 if 0x80000000 } // function rotate_left all elements // Use negative count to rotate right static inline Vec4i rotate_left(Vec4i const & a, int b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_rot_epi32(a,_mm_set1_epi32(b)); #else // SSE2 instruction set __m128i left = _mm_sll_epi32(a,_mm_cvtsi32_si128(b & 0x1F)); // a << b __m128i right = _mm_srl_epi32(a,_mm_cvtsi32_si128((32-b) & 0x1F)); // a >> (32 - b) __m128i rot = _mm_or_si128(left,right); // or return rot; #endif } /***************************************************************************** * * Vector of 4 32-bit unsigned integers * *****************************************************************************/ class Vec4ui : public Vec4i { public: // Default constructor: Vec4ui() { }; // Constructor to broadcast the same value into all elements: Vec4ui(uint32_t i) { xmm = _mm_set1_epi32(i); }; // Constructor to build from all elements: Vec4ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { xmm = _mm_setr_epi32(i0, i1, i2, i3); }; // Constructor to convert from type __m128i used in intrinsics: Vec4ui(__m128i const & x) { xmm = x; }; // Assignment operator to convert from type __m128i used in intrinsics: Vec4ui & operator = (__m128i const & x) { xmm = x; return *this; }; // Member function to load from array (unaligned) Vec4ui & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array (aligned) Vec4ui & load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec4ui const & insert(uint32_t index, uint32_t value) { Vec4i::insert(index, value); return *this; } // Member function extract a single element from vector uint32_t extract(uint32_t index) const { return Vec4i::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
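// Illustrative sketch (added; values chosen arbitrarily):
//     Vec4ui u(10, 20, 30, 40);
//     u.insert(1, 99);         // u is now (10, 99, 30, 40)
//     uint32_t x = u[3];       // x = 40, read-only access via extract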
uint32_t operator [] (uint32_t index) const { return extract(index); } }; // Define operators for this class // vector operator + : add static inline Vec4ui operator + (Vec4ui const & a, Vec4ui const & b) { return Vec4ui (Vec4i(a) + Vec4i(b)); } // vector operator - : subtract static inline Vec4ui operator - (Vec4ui const & a, Vec4ui const & b) { return Vec4ui (Vec4i(a) - Vec4i(b)); } // vector operator * : multiply static inline Vec4ui operator * (Vec4ui const & a, Vec4ui const & b) { return Vec4ui (Vec4i(a) * Vec4i(b)); } // vector operator / : divide // See bottom of file // vector operator >> : shift right logical all elements static inline Vec4ui operator >> (Vec4ui const & a, uint32_t b) { return _mm_srl_epi32(a,_mm_cvtsi32_si128(b)); } // vector operator >> : shift right logical all elements static inline Vec4ui operator >> (Vec4ui const & a, int32_t b) { return a >> (uint32_t)b; } // vector operator >>= : shift right logical static inline Vec4ui & operator >>= (Vec4ui & a, int b) { a = a >> b; return a; } // vector operator << : shift left all elements static inline Vec4ui operator << (Vec4ui const & a, uint32_t b) { return Vec4ui ((Vec4i)a << (int32_t)b); } // vector operator << : shift left all elements static inline Vec4ui operator << (Vec4ui const & a, int32_t b) { return Vec4ui ((Vec4i)a << (int32_t)b); } // vector operator > : returns true for elements for which a > b (unsigned) static inline Vec4ib operator > (Vec4ui const & a, Vec4ui const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comgt_epu32(a,b); #else // SSE2 instruction set __m128i signbit = _mm_set1_epi32(0x80000000); __m128i a1 = _mm_xor_si128(a,signbit); __m128i b1 = _mm_xor_si128(b,signbit); return _mm_cmpgt_epi32(a1,b1); // signed compare #endif } // vector operator < : returns true for elements for which a < b (unsigned) static inline Vec4ib operator < (Vec4ui const & a, Vec4ui const & b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (unsigned) static inline Vec4ib operator >= (Vec4ui const & a, Vec4ui const & b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_comge_epu32(a,b); #elif INSTRSET >= 5 // SSE4.1 __m128i max_ab = _mm_max_epu32(a,b); // max(a,b), unsigned return _mm_cmpeq_epi32(a,max_ab); // a == max(a,b) #else // SSE2 instruction set return Vec4ib(Vec4i (~(b > a))); #endif } // vector operator <= : returns true for elements for which a <= b (unsigned) static inline Vec4ib operator <= (Vec4ui const & a, Vec4ui const & b) { return b >= a; } // vector operator & : bitwise and static inline Vec4ui operator & (Vec4ui const & a, Vec4ui const & b) { return Vec4ui(Vec128b(a) & Vec128b(b)); } static inline Vec4ui operator && (Vec4ui const & a, Vec4ui const & b) { return a & b; } // vector operator | : bitwise or static inline Vec4ui operator | (Vec4ui const & a, Vec4ui const & b) { return Vec4ui(Vec128b(a) | Vec128b(b)); } static inline Vec4ui operator || (Vec4ui const & a, Vec4ui const & b) { return a | b; } // vector operator ^ : bitwise xor static inline Vec4ui operator ^ (Vec4ui const & a, Vec4ui const & b) { return Vec4ui(Vec128b(a) ^ Vec128b(b)); } // vector operator ~ : bitwise not static inline Vec4ui operator ~ (Vec4ui const & a) { return Vec4ui( ~ Vec128b(a)); } // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each word in s must be either 0 (false) or -1 (true). No other values are allowed. 
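// Illustrative usage sketch (added; values chosen arbitrarily). The unsigned compare
// treats 0x80000000 as a large value, unlike the signed Vec4i compare:
//     Vec4ui a(0x80000000u, 1, 2, 3), b(1, 1, 1, 1);
//     Vec4ib m = a > b;            // (-1, 0, -1, -1)
//     Vec4ui r = select(m, a, b);  // (0x80000000, 1, 2, 3), i.e. element-wise max(a,b)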
// (s is signed) static inline Vec4ui select (Vec4ib const & s, Vec4ui const & a, Vec4ui const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec4ui if_add (Vec4ib const & f, Vec4ui const & a, Vec4ui const & b) { return a + (Vec4ui(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around static inline uint32_t horizontal_add (Vec4ui const & a) { return horizontal_add((Vec4i)a); } // Horizontal add extended: Calculates the sum of all vector elements. // Elements are zero extended before adding to avoid overflow static inline uint64_t horizontal_add_x (Vec4ui const & a) { #ifdef __XOP__ // AMD XOP instruction set __m128i sum1 = _mm_haddq_epu32(a); #else // SSE2 __m128i zero = _mm_setzero_si128(); // 0 __m128i a01 = _mm_unpacklo_epi32(a,zero); // zero-extended a0, a1 __m128i a23 = _mm_unpackhi_epi32(a,zero); // zero-extended a2, a3 __m128i sum1 = _mm_add_epi64(a01,a23); // add #endif __m128i sum2 = _mm_unpackhi_epi64(sum1,sum1); // high qword __m128i sum3 = _mm_add_epi64(sum1,sum2); // add #if defined(_M_AMD64) || defined(_M_X64) || defined(__x86_64__) || defined(__amd64) return _mm_cvtsi128_si64(sum3); // 64 bit mode #else union { __m128i x; // silly definition of _mm_storel_epi64 requires __m128i uint64_t i; } u; _mm_storel_epi64(&u.x,sum3); return u.i; #endif } // function add_saturated: add element by element, unsigned with saturation static inline Vec4ui add_saturated(Vec4ui const & a, Vec4ui const & b) { Vec4ui sum = a + b; Vec4ui aorb = Vec4ui(a | b); Vec4ui overflow = Vec4ui(sum < aorb); // overflow if a + b < (a | b) return Vec4ui (sum | overflow); // return 0xFFFFFFFF if overflow } // function sub_saturated: subtract element by element, unsigned with saturation static inline Vec4ui sub_saturated(Vec4ui const & a, Vec4ui const & b) { Vec4ui diff = a - b; Vec4ui underflow = Vec4ui(diff > a); // underflow if a - b > a return _mm_andnot_si128(underflow,diff); // return 0 if underflow } // function max: a > b ? a : b static inline Vec4ui max(Vec4ui const & a, Vec4ui const & b) { #if INSTRSET >= 5 // SSE4.1 return _mm_max_epu32(a,b); #else // SSE2 return select(a > b, a, b); #endif } // function min: a < b ? a : b static inline Vec4ui min(Vec4ui const & a, Vec4ui const & b) { #if INSTRSET >= 5 // SSE4.1 return _mm_min_epu32(a,b); #else // SSE2 return select(a > b, b, a); #endif } /***************************************************************************** * * Vector of 2 64-bit signed integers * *****************************************************************************/ class Vec2q : public Vec128b { public: // Default constructor: Vec2q() { } // Constructor to broadcast the same value into all elements: Vec2q(int64_t i) { #if defined (_MSC_VER) && ! 
defined(__INTEL_COMPILER) // MS compiler has no _mm_set1_epi64x in 32 bit mode #if defined(__x86_64__) // 64 bit mode #if _MSC_VER < 1700 __m128i x1 = _mm_cvtsi64_si128(i); // 64 bit load xmm = _mm_unpacklo_epi64(x1,x1); // broadcast #else xmm = _mm_set1_epi64x(i); #endif #else union { int64_t q[2]; int32_t r[4]; } u; u.q[0] = u.q[1] = i; xmm = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]); /* // this will use an mm register and produce store forwarding stall: union { __m64 m; int64_t ii; } u; u.ii = i; xmm = _mm_set1_epi64(u.m); _m_empty(); */ #endif // __x86_64__ #else // Other compilers xmm = _mm_set1_epi64x(i); // emmintrin.h #endif } // Constructor to build from all elements: Vec2q(int64_t i0, int64_t i1) { #if defined (_MSC_VER) && ! defined(__INTEL_COMPILER) // MS compiler has no _mm_set_epi64x in 32 bit mode #if defined(__x86_64__) // 64 bit mode #if _MSC_VER < 1700 __m128i x0 = _mm_cvtsi64_si128(i0); // 64 bit load __m128i x1 = _mm_cvtsi64_si128(i1); // 64 bit load xmm = _mm_unpacklo_epi64(x0,x1); // combine #else xmm = _mm_set_epi64x(i1, i0); #endif #else // MS compiler in 32-bit mode union { int64_t q[2]; int32_t r[4]; } u; u.q[0] = i0; u.q[1] = i1; // this is inefficient, but other solutions are worse xmm = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]); #endif // __x86_64__ #else // Other compilers xmm = _mm_set_epi64x(i1, i0); #endif } // Constructor to convert from type __m128i used in intrinsics: Vec2q(__m128i const & x) { xmm = x; } // Assignment operator to convert from type __m128i used in intrinsics: Vec2q & operator = (__m128i const & x) { xmm = x; return *this; } // Type cast operator to convert to __m128i used in intrinsics operator __m128i() const { return xmm; } // Member function to load from array (unaligned) Vec2q & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array (aligned) Vec2q & load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); return *this; } // Partial load. Load n elements and set the rest to 0 Vec2q & load_partial(int n, void const * p) { switch (n) { case 0: *this = 0; break; case 1: // intrinsic for movq is missing! *this = Vec2q(*(int64_t*)p, 0); break; case 2: load(p); break; default: break; } return *this; } // Partial store. Store n elements void store_partial(int n, void * p) const { switch (n) { case 1: int64_t q[2]; store(q); *(int64_t*)p = q[0]; break; case 2: store(p); break; default: break; } } // cut off vector to n elements. The last 2-n elements are set to zero Vec2q & cutoff(int n) { *this = Vec16c(xmm).cutoff(n * 8); return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec2q const & insert(uint32_t index, int64_t value) { #if INSTRSET >= 5 && defined(__x86_64__) // SSE4.1 supported, 64 bit mode if (index == 0) { xmm = _mm_insert_epi64(xmm,value,0); } else { xmm = _mm_insert_epi64(xmm,value,1); } #else // SSE2 #if defined(__x86_64__) // 64 bit mode __m128i v = _mm_cvtsi64_si128(value); // 64 bit load #else union { __m128i m; int64_t ii; } u; u.ii = value; __m128i v = _mm_loadl_epi64(&u.m); #endif if (index == 0) { v = _mm_unpacklo_epi64(v,v); xmm = _mm_unpackhi_epi64(v,xmm); } else { // index = 1 xmm = _mm_unpacklo_epi64(xmm,v); } #endif return *this; } // Member function extract a single element from vector int64_t extract(uint32_t index) const { int64_t x[2]; store(x); return x[index & 1]; } // Extract a single element. 
Use store function if extracting more than one element. // Operator [] can only read an element, not write. int64_t operator [] (uint32_t index) const { return extract(index); } static int size() { return 2; } }; /***************************************************************************** * * Vec2qb: Vector of 2 Booleans for use with Vec2q and Vec2uq * *****************************************************************************/ // Definition will be different for the AVX512 instruction set class Vec2qb : public Vec2q { public: // Default constructor: Vec2qb() { } // Constructor to build from all elements: Vec2qb(bool x0, bool x1) { xmm = Vec2q(-int64_t(x0), -int64_t(x1)); } // Constructor to convert from type __m128i used in intrinsics: Vec2qb(__m128i const & x) { xmm = x; } // Assignment operator to convert from type __m128i used in intrinsics: Vec2qb & operator = (__m128i const & x) { xmm = x; return *this; } // Constructor to broadcast scalar value: Vec2qb(bool b) : Vec2q(-int64_t(b)) { } // Assignment operator to broadcast scalar value: Vec2qb & operator = (bool b) { *this = Vec2qb(b); return *this; } private: // Prevent constructing from int, etc. Vec2qb(int b); Vec2qb & operator = (int x); public: Vec2qb & insert (int index, bool a) { Vec2q::insert(index, -(int64_t)a); return *this; } // Member function extract a single element from vector bool extract(uint32_t index) const { return Vec2q::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. bool operator [] (uint32_t index) const { return extract(index); } }; /***************************************************************************** * * Define operators for Vec2qb * *****************************************************************************/ // vector operator & : bitwise and static inline Vec2qb operator & (Vec2qb const & a, Vec2qb const & b) { return Vec2qb(Vec128b(a) & Vec128b(b)); } static inline Vec2qb operator && (Vec2qb const & a, Vec2qb const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec2qb & operator &= (Vec2qb & a, Vec2qb const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec2qb operator | (Vec2qb const & a, Vec2qb const & b) { return Vec2qb(Vec128b(a) | Vec128b(b)); } static inline Vec2qb operator || (Vec2qb const & a, Vec2qb const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec2qb & operator |= (Vec2qb & a, Vec2qb const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec2qb operator ^ (Vec2qb const & a, Vec2qb const & b) { return Vec2qb(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise xor static inline Vec2qb & operator ^= (Vec2qb & a, Vec2qb const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec2qb operator ~ (Vec2qb const & a) { return Vec2qb( ~ Vec128b(a)); } // vector operator ! : element not static inline Vec2qb operator ! 
(Vec2qb const & a) { return ~ a; } // vector function andnot static inline Vec2qb andnot (Vec2qb const & a, Vec2qb const & b) { return Vec2qb(andnot(Vec128b(a), Vec128b(b))); } /***************************************************************************** * * Operators for Vec2q * *****************************************************************************/ // vector operator + : add element by element static inline Vec2q operator + (Vec2q const & a, Vec2q const & b) { return _mm_add_epi64(a, b); } // vector operator += : add static inline Vec2q & operator += (Vec2q & a, Vec2q const & b) { a = a + b; return a; } // postfix operator ++ static inline Vec2q operator ++ (Vec2q & a, int) { Vec2q a0 = a; a = a + 1; return a0; } // prefix operator ++ static inline Vec2q & operator ++ (Vec2q & a) { a = a + 1; return a; } // vector operator - : subtract element by element static inline Vec2q operator - (Vec2q const & a, Vec2q const & b) { return _mm_sub_epi64(a, b); } // vector operator - : unary minus static inline Vec2q operator - (Vec2q const & a) { return _mm_sub_epi64(_mm_setzero_si128(), a); } // vector operator -= : subtract static inline Vec2q & operator -= (Vec2q & a, Vec2q const & b) { a = a - b; return a; } // postfix operator -- static inline Vec2q operator -- (Vec2q & a, int) { Vec2q a0 = a; a = a - 1; return a0; } // prefix operator -- static inline Vec2q & operator -- (Vec2q & a) { a = a - 1; return a; } // vector operator * : multiply element by element static inline Vec2q operator * (Vec2q const & a, Vec2q const & b) { #if INSTRSET >= 5 // SSE4.1 supported // instruction does not exist. Split into 32-bit multiplies __m128i bswap = _mm_shuffle_epi32(b,0xB1); // b0H,b0L,b1H,b1L (swap H<->L) __m128i prodlh = _mm_mullo_epi32(a,bswap); // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products __m128i zero = _mm_setzero_si128(); // 0 __m128i prodlh2 = _mm_hadd_epi32(prodlh,zero); // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0 __m128i prodlh3 = _mm_shuffle_epi32(prodlh2,0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L __m128i prodll = _mm_mul_epu32(a,b); // a0Lb0L,a1Lb1L, 64 bit unsigned products __m128i prod = _mm_add_epi64(prodll,prodlh3); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32 return prod; #else // SSE2 int64_t aa[2], bb[2]; a.store(aa); // split into elements b.store(bb); return Vec2q(aa[0]*bb[0], aa[1]*bb[1]); // multiply elements separetely #endif } // vector operator *= : multiply static inline Vec2q & operator *= (Vec2q & a, Vec2q const & b) { a = a * b; return a; } // vector operator << : shift left static inline Vec2q operator << (Vec2q const & a, int32_t b) { return _mm_sll_epi64(a,_mm_cvtsi32_si128(b)); } // vector operator <<= : shift left static inline Vec2q & operator <<= (Vec2q & a, int32_t b) { a = a << b; return a; } // vector operator >> : shift right arithmetic static inline Vec2q operator >> (Vec2q const & a, int32_t b) { // instruction does not exist. 
Split into 32-bit shifts if (b <= 32) { __m128i bb = _mm_cvtsi32_si128(b); // b __m128i sra = _mm_sra_epi32(a,bb); // a >> b signed dwords __m128i srl = _mm_srl_epi64(a,bb); // a >> b unsigned qwords __m128i mask = _mm_setr_epi32(0,-1,0,-1); // mask for signed high part return selectb(mask,sra,srl); } else { // b > 32 __m128i bm32 = _mm_cvtsi32_si128(b-32); // b - 32 __m128i sign = _mm_srai_epi32(a,31); // sign of a __m128i sra2 = _mm_sra_epi32(a,bm32); // a >> (b-32) signed dwords __m128i sra3 = _mm_srli_epi64(sra2,32); // a >> (b-32) >> 32 (second shift unsigned qword) __m128i mask = _mm_setr_epi32(0,-1,0,-1); // mask for high part containing only sign return selectb(mask,sign,sra3); } } // vector operator >>= : shift right arithmetic static inline Vec2q & operator >>= (Vec2q & a, int32_t b) { a = a >> b; return a; } // vector operator == : returns true for elements for which a == b static inline Vec2qb operator == (Vec2q const & a, Vec2q const & b) { #if INSTRSET >= 5 // SSE4.1 supported return _mm_cmpeq_epi64(a, b); #else // SSE2 // no 64 compare instruction. Do two 32 bit compares __m128i com32 = _mm_cmpeq_epi32(a,b); // 32 bit compares __m128i com32s = _mm_shuffle_epi32(com32,0xB1); // swap low and high dwords __m128i test = _mm_and_si128(com32,com32s); // low & high __m128i teste = _mm_srai_epi32(test,31); // extend sign bit to 32 bits __m128i testee = _mm_shuffle_epi32(teste,0xF5); // extend sign bit to 64 bits return Vec2qb(Vec2q(testee)); #endif } // vector operator != : returns true for elements for which a != b static inline Vec2qb operator != (Vec2q const & a, Vec2q const & b) { #ifdef __XOP__ // AMD XOP instruction set return Vec2q(_mm_comneq_epi64(a,b)); #else // SSE2 instruction set return Vec2qb(Vec2q(~(a == b))); #endif } // vector operator < : returns true for elements for which a < b static inline Vec2qb operator < (Vec2q const & a, Vec2q const & b) { #if INSTRSET >= 6 // SSE4.2 supported return Vec2qb(Vec2q(_mm_cmpgt_epi64(b, a))); #else // SSE2 // no 64 compare instruction. 
Subtract __m128i s = _mm_sub_epi64(a,b); // a-b // a < b if a and b have same sign and s < 0 or (a < 0 and b >= 0) // The latter () corrects for overflow __m128i axb = _mm_xor_si128(a,b); // a ^ b __m128i anb = _mm_andnot_si128(b,a); // a & ~b __m128i snaxb = _mm_andnot_si128(axb,s); // s & ~(a ^ b) __m128i or1 = _mm_or_si128(anb,snaxb); // (a & ~b) | (s & ~(a ^ b)) __m128i teste = _mm_srai_epi32(or1,31); // extend sign bit to 32 bits __m128i testee = _mm_shuffle_epi32(teste,0xF5); // extend sign bit to 64 bits return testee; #endif } // vector operator > : returns true for elements for which a > b static inline Vec2qb operator > (Vec2q const & a, Vec2q const & b) { return b < a; } // vector operator >= : returns true for elements for which a >= b (signed) static inline Vec2qb operator >= (Vec2q const & a, Vec2q const & b) { #ifdef __XOP__ // AMD XOP instruction set return Vec2q(_mm_comge_epi64(a,b)); #else // SSE2 instruction set return Vec2qb(Vec2q(~(a < b))); #endif } // vector operator <= : returns true for elements for which a <= b (signed) static inline Vec2qb operator <= (Vec2q const & a, Vec2q const & b) { return b >= a; } // vector operator & : bitwise and static inline Vec2q operator & (Vec2q const & a, Vec2q const & b) { return Vec2q(Vec128b(a) & Vec128b(b)); } static inline Vec2q operator && (Vec2q const & a, Vec2q const & b) { return a & b; } // vector operator &= : bitwise and static inline Vec2q & operator &= (Vec2q & a, Vec2q const & b) { a = a & b; return a; } // vector operator | : bitwise or static inline Vec2q operator | (Vec2q const & a, Vec2q const & b) { return Vec2q(Vec128b(a) | Vec128b(b)); } static inline Vec2q operator || (Vec2q const & a, Vec2q const & b) { return a | b; } // vector operator |= : bitwise or static inline Vec2q & operator |= (Vec2q & a, Vec2q const & b) { a = a | b; return a; } // vector operator ^ : bitwise xor static inline Vec2q operator ^ (Vec2q const & a, Vec2q const & b) { return Vec2q(Vec128b(a) ^ Vec128b(b)); } // vector operator ^= : bitwise xor static inline Vec2q & operator ^= (Vec2q & a, Vec2q const & b) { a = a ^ b; return a; } // vector operator ~ : bitwise not static inline Vec2q operator ~ (Vec2q const & a) { return Vec2q( ~ Vec128b(a)); } // vector operator ! : logical not, returns true for elements == 0 static inline Vec2qb operator ! (Vec2q const & a) { return a == Vec2q(_mm_setzero_si128()); } // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) static inline Vec2q select (Vec2qb const & s, Vec2q const & a, Vec2q const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec2q if_add (Vec2qb const & f, Vec2q const & a, Vec2q const & b) { return a + (Vec2q(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around static inline int64_t horizontal_add (Vec2q const & a) { __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // high element __m128i sum2 = _mm_add_epi64(a,sum1); // sum #if defined(__x86_64__) return _mm_cvtsi128_si64(sum2); // 64 bit mode #else union { __m128i x; // silly definition of _mm_storel_epi64 requires __m128i int64_t i; } u; _mm_storel_epi64(&u.x,sum2); return u.i; #endif } // function max: a > b ? 
a : b static inline Vec2q max(Vec2q const & a, Vec2q const & b) { return select(a > b, a, b); } // function min: a < b ? a : b static inline Vec2q min(Vec2q const & a, Vec2q const & b) { return select(a < b, a, b); } // function abs: a >= 0 ? a : -a static inline Vec2q abs(Vec2q const & a) { #if INSTRSET >= 6 // SSE4.2 supported __m128i sign = _mm_cmpgt_epi64(_mm_setzero_si128(),a);// 0 > a #else // SSE2 __m128i signh = _mm_srai_epi32(a,31); // sign in high dword __m128i sign = _mm_shuffle_epi32(signh,0xF5); // copy sign to low dword #endif __m128i inv = _mm_xor_si128(a,sign); // invert bits if negative return _mm_sub_epi64(inv,sign); // add 1 } // function abs_saturated: same as abs, saturate if overflow static inline Vec2q abs_saturated(Vec2q const & a) { __m128i absa = abs(a); // abs(a) #if INSTRSET >= 6 // SSE4.2 supported __m128i overfl = _mm_cmpgt_epi64(_mm_setzero_si128(),absa);// 0 > a #else // SSE2 __m128i signh = _mm_srai_epi32(absa,31); // sign in high dword __m128i overfl= _mm_shuffle_epi32(signh,0xF5); // copy sign to low dword #endif return _mm_add_epi64(absa,overfl); // subtract 1 if 0x8000000000000000 } // function rotate_left all elements // Use negative count to rotate right static inline Vec2q rotate_left(Vec2q const & a, int b) { #ifdef __XOP__ // AMD XOP instruction set return _mm_rot_epi64(a,Vec2q(b)); #else // SSE2 instruction set __m128i left = _mm_sll_epi64(a,_mm_cvtsi32_si128(b & 0x3F)); // a << b __m128i right = _mm_srl_epi64(a,_mm_cvtsi32_si128((64-b) & 0x3F)); // a >> (64 - b) __m128i rot = _mm_or_si128(left,right); // or return rot; #endif } /***************************************************************************** * * Vector of 2 64-bit unsigned integers * *****************************************************************************/ class Vec2uq : public Vec2q { public: // Default constructor: Vec2uq() { }; // Constructor to broadcast the same value into all elements: Vec2uq(uint64_t i) { xmm = Vec2q(i); }; // Constructor to build from all elements: Vec2uq(uint64_t i0, uint64_t i1) { xmm = Vec2q(i0, i1); }; // Constructor to convert from type __m128i used in intrinsics: Vec2uq(__m128i const & x) { xmm = x; }; // Assignment operator to convert from type __m128i used in intrinsics: Vec2uq & operator = (__m128i const & x) { xmm = x; return *this; }; // Member function to load from array (unaligned) Vec2uq & load(void const * p) { xmm = _mm_loadu_si128((__m128i const*)p); return *this; } // Member function to load from array (aligned) Vec2uq & load_a(void const * p) { xmm = _mm_load_si128((__m128i const*)p); return *this; } // Member function to change a single element in vector // Note: This function is inefficient. Use load function if changing more than one element Vec2uq const & insert(uint32_t index, uint64_t value) { Vec2q::insert(index, value); return *this; } // Member function extract a single element from vector uint64_t extract(uint32_t index) const { return Vec2q::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
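// Illustrative sketch (added; values chosen arbitrarily):
//     Vec2uq q(5, 0xFFFFFFFFFFFFFFFFull);
//     q.insert(0, 7);          // q is now (7, 0xFFFFFFFFFFFFFFFF)
//     uint64_t x = q[1];       // x = 0xFFFFFFFFFFFFFFFF, read-only access via extract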
uint64_t operator [] (uint32_t index) const { return extract(index); } }; // Define operators for this class // vector operator + : add static inline Vec2uq operator + (Vec2uq const & a, Vec2uq const & b) { return Vec2uq (Vec2q(a) + Vec2q(b)); } // vector operator - : subtract static inline Vec2uq operator - (Vec2uq const & a, Vec2uq const & b) { return Vec2uq (Vec2q(a) - Vec2q(b)); } // vector operator * : multiply element by element static inline Vec2uq operator * (Vec2uq const & a, Vec2uq const & b) { return Vec2uq (Vec2q(a) * Vec2q(b)); } // vector operator >> : shift right logical all elements static inline Vec2uq operator >> (Vec2uq const & a, uint32_t b) { return _mm_srl_epi64(a,_mm_cvtsi32_si128(b)); } // vector operator >> : shift right logical all elements static inline Vec2uq operator >> (Vec2uq const & a, int32_t b) { return a >> (uint32_t)b; } // vector operator >>= : shift right logical static inline Vec2uq & operator >>= (Vec2uq & a, int b) { a = a >> b; return a; } // vector operator << : shift left all elements static inline Vec2uq operator << (Vec2uq const & a, uint32_t b) { return Vec2uq ((Vec2q)a << (int32_t)b); } // vector operator << : shift left all elements static inline Vec2uq operator << (Vec2uq const & a, int32_t b) { return Vec2uq ((Vec2q)a << b); } // vector operator > : returns true for elements for which a > b (unsigned) static inline Vec2qb operator > (Vec2uq const & a, Vec2uq const & b) { #ifdef __XOP__ // AMD XOP instruction set return Vec2q(_mm_comgt_epu64(a,b)); #else // SSE2 instruction set __m128i sign32 = _mm_set1_epi32(0x80000000); // sign bit of each dword __m128i aflip = _mm_xor_si128(a,sign32); // a with sign bits flipped __m128i bflip = _mm_xor_si128(b,sign32); // b with sign bits flipped __m128i equal = _mm_cmpeq_epi32(a,b); // a == b, dwords __m128i bigger = _mm_cmpgt_epi32(aflip,bflip); // a > b, dwords __m128i biggerl = _mm_shuffle_epi32(bigger,0xA0); // a > b, low dwords copied to high dwords __m128i eqbig = _mm_and_si128(equal,biggerl); // high part equal and low part bigger __m128i hibig = _mm_or_si128(bigger,eqbig); // high part bigger or high part equal and low part bigger __m128i big = _mm_shuffle_epi32(hibig,0xF5); // result copied to low part return Vec2qb(Vec2q(big)); #endif } // vector operator < : returns true for elements for which a < b (unsigned) static inline Vec2qb operator < (Vec2uq const & a, Vec2uq const & b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (unsigned) static inline Vec2qb operator >= (Vec2uq const & a, Vec2uq const & b) { #ifdef __XOP__ // AMD XOP instruction set return Vec2q(_mm_comge_epu64(a,b)); #else // SSE2 instruction set return Vec2qb(Vec2q(~(b > a))); #endif } // vector operator <= : returns true for elements for which a <= b (unsigned) static inline Vec2qb operator <= (Vec2uq const & a, Vec2uq const & b) { return b >= a; } // vector operator & : bitwise and static inline Vec2uq operator & (Vec2uq const & a, Vec2uq const & b) { return Vec2uq(Vec128b(a) & Vec128b(b)); } static inline Vec2uq operator && (Vec2uq const & a, Vec2uq const & b) { return a & b; } // vector operator | : bitwise or static inline Vec2uq operator | (Vec2uq const & a, Vec2uq const & b) { return Vec2uq(Vec128b(a) | Vec128b(b)); } static inline Vec2uq operator || (Vec2uq const & a, Vec2uq const & b) { return a | b; } // vector operator ^ : bitwise xor static inline Vec2uq operator ^ (Vec2uq const & a, Vec2uq const & b) { return Vec2uq(Vec128b(a) ^ Vec128b(b)); } // vector operator ~ : bitwise 
not static inline Vec2uq operator ~ (Vec2uq const & a) { return Vec2uq( ~ Vec128b(a)); } // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; // Each word in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) static inline Vec2uq select (Vec2qb const & s, Vec2uq const & a, Vec2uq const & b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] static inline Vec2uq if_add (Vec2qb const & f, Vec2uq const & a, Vec2uq const & b) { return a + (Vec2uq(f) & b); } // Horizontal add: Calculates the sum of all vector elements. // Overflow will wrap around static inline uint64_t horizontal_add (Vec2uq const & a) { return horizontal_add((Vec2q)a); } // function max: a > b ? a : b static inline Vec2uq max(Vec2uq const & a, Vec2uq const & b) { return select(a > b, a, b); } // function min: a < b ? a : b static inline Vec2uq min(Vec2uq const & a, Vec2uq const & b) { return select(a > b, b, a); } /***************************************************************************** * * Vector permute functions * ****************************************************************************** * * These permute functions can reorder the elements of a vector and optionally * set some elements to zero. * * The indexes are inserted as template parameters in <>. These indexes must be * constants. Each template parameter is an index to the element you want to * select. A negative index will generate zero. an index of -256 means don't care. * * Example: * Vec4i a(10,11,12,13); // a is (10,11,12,13) * Vec4i b, c; * b = permute4i<0,0,2,2>(a); // b is (10,10,12,12) * c = permute4i<3,2,-1,-1>(a); // c is (13,12, 0, 0) * * The permute functions for vectors of 8-bit integers are inefficient if * the SSSE3 instruction set or later is not enabled. * * A lot of the code here is metaprogramming aiming to find the instructions * that best fit the template parameters and instruction set. The metacode * will be reduced out to leave only a few vector instructions in release * mode with optimization on. 
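 * Another illustrative example (added; index and element values chosen arbitrarily),
 * showing zeroing (negative index) and don't care (-256):
 * Vec8s d(0,1,2,3,4,5,6,7);
 * Vec8s e = permute8s<7,6,5,4,3,2,1,0>(d);          // e is (7,6,5,4,3,2,1,0)
 * Vec8s f = permute8s<2,2,-1,-1,0,1,-256,-256>(d);  // f is (2,2,0,0,0,1,?,?) where ? is unspecified
 *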
*****************************************************************************/ template <int i0, int i1> static inline Vec2q permute2q(Vec2q const & a) { if (i0 == 0) { if (i1 == 0) { // 0,0 return _mm_unpacklo_epi64(a, a); } else if (i1 == 1 || i1 == -0x100) { // 0,1 return a; } else { // 0,-1 // return _mm_mov_epi64(a); // doesn't work with MS VS 2008 return _mm_and_si128(a, constant4i<-1,-1,0,0>()); } } else if (i0 == 1) { if (i1 == 0) { // 1,0 return _mm_shuffle_epi32(a, 0x4E); } else if (i1 == 1) { // 1,1 return _mm_unpackhi_epi64(a, a); } else { // 1,-1 return _mm_srli_si128(a, 8); } } else { // i0 < 0 if (i1 == 0) { // -1,0 return _mm_slli_si128(a, 8); } else if (i1 == 1) { // -1,1 if (i0 == -0x100) return a; return _mm_and_si128(a, constant4i<0,0,-1,-1>()); } else { // -1,-1 return _mm_setzero_si128(); } } } template <int i0, int i1> static inline Vec2uq permute2uq(Vec2uq const & a) { return Vec2uq (permute2q <i0, i1> ((__m128i)a)); } // permute vector Vec4i template <int i0, int i1, int i2, int i3> static inline Vec4i permute4i(Vec4i const & a) { // Combine all the indexes into a single bitfield, with 4 bits for each const int m1 = (i0&3) | (i1&3)<<4 | (i2&3)<<8 | (i3&3)<<12; // Mask to zero out negative indexes const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12; // Mask indicating required zeroing of all indexes, with 4 bits for each, 0 for index = -1, 0xF for index >= 0 or -256 const int ssz = ((i0 & 0x80) ? 0 : 0xF) | ((i1 & 0x80) ? 0 : 0xF) << 4 | ((i2 & 0x80) ? 0 : 0xF) << 8 | ((i3 & 0x80) ? 0 : 0xF) << 12; // Mask indicating 0 for don't care, 0xF for non-negative value of required zeroing const int md = mz | ~ ssz; // Test if permutation needed const bool do_shuffle = ((m1 ^ 0x00003210) & mz) != 0; // is zeroing needed const bool do_zero = (ssz != 0xFFFF); if (mz == 0) { return _mm_setzero_si128(); // special case: all zero or don't care } // Test if we can do with 64-bit permute only if ((m1 & 0x0101 & mz) == 0 // even indexes are even or negative && (~m1 & 0x1010 & mz) == 0 // odd indexes are odd or negative && ((m1 ^ ((m1 + 0x0101) << 4)) & 0xF0F0 & mz & (mz << 4)) == 0 // odd index == preceding even index +1 or at least one of them negative && ((mz ^ (mz << 4)) & 0xF0F0 & md & md << 4) == 0) { // each pair of indexes are both negative or both positive or one of them don't care const int j0 = i0 >= 0 ? i0 / 2 : (i0 & 0x80) ? i0 : i1 >= 0 ? i1/2 : i1; const int j1 = i2 >= 0 ? i2 / 2 : (i2 & 0x80) ? i2 : i3 >= 0 ? i3/2 : i3; return Vec4i(permute2q <j0, j1> (Vec2q(a))); // 64 bit permute } #if INSTRSET >= 4 // SSSE3 if (do_shuffle && do_zero) { // With SSSE3 we can do both with the PSHUFB instruction const int j0 = (i0 & 3) << 2; const int j1 = (i1 & 3) << 2; const int j2 = (i2 & 3) << 2; const int j3 = (i3 & 3) << 2; __m128i mask1 = constant4i < i0 < 0 ? -1 : j0 | (j0+1)<<8 | (j0+2)<<16 | (j0+3) << 24, i1 < 0 ? -1 : j1 | (j1+1)<<8 | (j1+2)<<16 | (j1+3) << 24, i2 < 0 ? -1 : j2 | (j2+1)<<8 | (j2+2)<<16 | (j2+3) << 24, i3 < 0 ?
-1 : j3 | (j3+1)<<8 | (j3+2)<<16 | (j3+3) << 24 > (); return _mm_shuffle_epi8(a,mask1); } #endif __m128i t1; if (do_shuffle) { // permute t1 = _mm_shuffle_epi32(a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6); } else { t1 = a; } if (do_zero) { // set some elements to zero __m128i mask2 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >(); t1 = _mm_and_si128(t1,mask2); } return t1; } template <int i0, int i1, int i2, int i3> static inline Vec4ui permute4ui(Vec4ui const & a) { return Vec4ui (permute4i <i0,i1,i2,i3> (a)); } template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> static inline Vec8s permute8s(Vec8s const & a) { if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) { return _mm_setzero_si128(); // special case: all zero } #if INSTRSET >= 4 // SSSE3 // special case: rotate if (i0>=0 && i0 < 8 && i1==((i0+1)&7) && i2==((i0+2)&7) && i3==((i0+3)&7) && i4==((i0+4)&7) && i5==((i0+5)&7) && i6==((i0+6)&7) && i7==((i0+7)&7)) { if (i0 == 0) return a; // do nothing return _mm_alignr_epi8(a, a, (i0 & 7) * 2); } // General case: Use PSHUFB const int j0 = i0 < 0 ? 0xFFFF : ( (i0 & 7) * 2 | ((i0 & 7) * 2 + 1) << 8 ); const int j1 = i1 < 0 ? 0xFFFF : ( (i1 & 7) * 2 | ((i1 & 7) * 2 + 1) << 8 ); const int j2 = i2 < 0 ? 0xFFFF : ( (i2 & 7) * 2 | ((i2 & 7) * 2 + 1) << 8 ); const int j3 = i3 < 0 ? 0xFFFF : ( (i3 & 7) * 2 | ((i3 & 7) * 2 + 1) << 8 ); const int j4 = i4 < 0 ? 0xFFFF : ( (i4 & 7) * 2 | ((i4 & 7) * 2 + 1) << 8 ); const int j5 = i5 < 0 ? 0xFFFF : ( (i5 & 7) * 2 | ((i5 & 7) * 2 + 1) << 8 ); const int j6 = i6 < 0 ? 0xFFFF : ( (i6 & 7) * 2 | ((i6 & 7) * 2 + 1) << 8 ); const int j7 = i7 < 0 ? 0xFFFF : ( (i7 & 7) * 2 | ((i7 & 7) * 2 + 1) << 8 ); __m128i mask = constant4i < j0 | j1 << 16, j2 | j3 << 16, j4 | j5 << 16, j6 | j7 << 16 > (); return _mm_shuffle_epi8(a,mask); #else // SSE2 has no simple solution. Find the optimal permute method. // Without proper metaprogramming features, we have to use constant expressions // and if-statements to make sure these calculations are resolved at compile time. // All this should produce at most 8 instructions in the final code, depending // on the template parameters. // Temporary vectors __m128i t1, t2, t3, t4, t5, t6, t7; // Combine all the indexes into a single bitfield, with 4 bits for each const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28; // Mask to zero out negative indexes const int m2 = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; // Test if we can do without permute const bool case0 = ((m1 ^ 0x76543210) & m2) == 0; // all indexes point to their own place or negative // Test if we can do with 32-bit permute only const bool case1 = (m1 & 0x01010101 & m2) == 0 // even indexes are even or negative && (~m1 & 0x10101010 & m2) == 0 // odd indexes are odd or negative && ((m1 ^ ((m1 + 0x01010101) << 4)) & 0xF0F0F0F0 & m2 & (m2 << 4)) == 0; // odd index == preceding even index +1 or at least one of them negative // Test if we can do with 16-bit permute only const bool case2 = (((m1 & 0x44444444) ^ 0x44440000) & m2) == 0; // indexes 0-3 point to lower 64 bits, 4-7 to higher 64 bits, or negative if (case0) { // no permute needed t7 = a; } else if (case1) { // 32 bit permute only const int j0 = i0 >= 0 ? i0/2 : i1 >= 0 ? i1/2 : 0; const int j1 = i2 >= 0 ? i2/2 : i3 >= 0 ? i3/2 : 0; const int j2 = i4 >= 0 ? i4/2 : i5 >= 0 ? i5/2 : 0; const int j3 = i6 >= 0 ? i6/2 : i7 >= 0 ?
i7/2 : 0; t7 = _mm_shuffle_epi32(a, (j0&3) | (j1&3)<<2 | (j2&3)<<4 | (j3&3)<<6 ); } else if (case2) { // 16 bit permute only const int j0 = i0 >= 0 ? i0&3 : 0; const int j1 = i1 >= 0 ? i1&3 : 1; const int j2 = i2 >= 0 ? i2&3 : 2; const int j3 = i3 >= 0 ? i3&3 : 3; const int j4 = i4 >= 0 ? i4&3 : 0; const int j5 = i5 >= 0 ? i5&3 : 1; const int j6 = i6 >= 0 ? i6&3 : 2; const int j7 = i7 >= 0 ? i7&3 : 3; if (j0!=0 || j1!=1 || j2!=2 || j3!=3) { t1 = _mm_shufflelo_epi16(a, j0 | j1<<2 | j2<<4 | j3<<6); } else t1 = a; if (j4!=0 || j5!=1 || j6!=2 || j7!=3) { t7 = _mm_shufflehi_epi16(t1, j4 | j5<<2 | j6<<4 | j7<<6); } else t7 = t1; } else { // Need at least two permute steps // Index to where each dword of a is needed const int nn = (m1 & 0x66666666) | 0x88888888; // indicate which dwords are needed const int n0 = ((((uint32_t)(nn ^ 0x00000000) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; const int n1 = ((((uint32_t)(nn ^ 0x22222222) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; const int n2 = ((((uint32_t)(nn ^ 0x44444444) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; const int n3 = ((((uint32_t)(nn ^ 0x66666666) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; // indicate which dwords are needed in low half const int l0 = (n0 & 0xFFFF) != 0; const int l1 = (n1 & 0xFFFF) != 0; const int l2 = (n2 & 0xFFFF) != 0; const int l3 = (n3 & 0xFFFF) != 0; // indicate which dwords are needed in high half const int h0 = (n0 & 0xFFFF0000) != 0; const int h1 = (n1 & 0xFFFF0000) != 0; const int h2 = (n2 & 0xFFFF0000) != 0; const int h3 = (n3 & 0xFFFF0000) != 0; // Test if we can do with two permute steps const bool case3 = l0 + l1 + l2 + l3 <= 2 && h0 + h1 + h2 + h3 <= 2; if (case3) { // one 32-bit permute followed by one 16-bit permute in each half. // Find permute indices for 32-bit permute const int j0 = l0 ? 0 : l1 ? 1 : l2 ? 2 : 3; const int j1 = l3 ? 3 : l2 ? 2 : l1 ? 1 : 0; const int j2 = h0 ? 0 : h1 ? 1 : h2 ? 2 : 3; const int j3 = h3 ? 3 : h2 ? 2 : h1 ? 1 : 0; // Find permute indices for low 16-bit permute const int r0 = i0 < 0 ? 0 : (i0>>1 == j0 ? 0 : 2) + (i0 & 1); const int r1 = i1 < 0 ? 1 : (i1>>1 == j0 ? 0 : 2) + (i1 & 1); const int r2 = i2 < 0 ? 2 : (i2>>1 == j1 ? 2 : 0) + (i2 & 1); const int r3 = i3 < 0 ? 3 : (i3>>1 == j1 ? 2 : 0) + (i3 & 1); // Find permute indices for high 16-bit permute const int s0 = i4 < 0 ? 0 : (i4>>1 == j2 ? 0 : 2) + (i4 & 1); const int s1 = i5 < 0 ? 1 : (i5>>1 == j2 ? 0 : 2) + (i5 & 1); const int s2 = i6 < 0 ? 2 : (i6>>1 == j3 ? 2 : 0) + (i6 & 1); const int s3 = i7 < 0 ? 3 : (i7>>1 == j3 ? 2 : 0) + (i7 & 1); // 32-bit permute t1 = _mm_shuffle_epi32 (a, j0 | j1<<2 | j2<<4 | j3<<6); // 16-bit permutes if (r0!=0 || r1!=1 || r2!=2 || r3!=3) { // 16 bit permute of low half t2 = _mm_shufflelo_epi16(t1, r0 | r1<<2 | r2<<4 | r3<<6); } else t2 = t1; if (s0!=0 || s1!=1 || s2!=2 || s3!=3) { // 16 bit permute of high half t7 = _mm_shufflehi_epi16(t2, s0 | s1<<2 | s2<<4 | s3<<6); } else t7 = t2; } else { // Worst case. We need two sets of 16-bit permutes t1 = _mm_shuffle_epi32(a, 0x4E); // swap low and high 64-bits // Find permute indices for low 16-bit permute from swapped t1 const int r0 = i0 < 4 ? 0 : i0 & 3; const int r1 = i1 < 4 ? 1 : i1 & 3; const int r2 = i2 < 4 ? 2 : i2 & 3; const int r3 = i3 < 4 ? 3 : i3 & 3; // Find permute indices for high 16-bit permute from swapped t1 const int s0 = i4 < 0 || i4 >= 4 ? 0 : i4 & 3; const int s1 = i5 < 0 || i5 >= 4 ? 1 : i5 & 3; const int s2 = i6 < 0 || i6 >= 4 ? 2 : i6 & 3; const int s3 = i7 < 0 || i7 >= 4 ? 
3 : i7 & 3; // Find permute indices for low 16-bit permute from direct a const int u0 = i0 < 0 || i0 >= 4 ? 0 : i0 & 3; const int u1 = i1 < 0 || i1 >= 4 ? 1 : i1 & 3; const int u2 = i2 < 0 || i2 >= 4 ? 2 : i2 & 3; const int u3 = i3 < 0 || i3 >= 4 ? 3 : i3 & 3; // Find permute indices for high 16-bit permute from direct a const int v0 = i4 < 4 ? 0 : i4 & 3; const int v1 = i5 < 4 ? 1 : i5 & 3; const int v2 = i6 < 4 ? 2 : i6 & 3; const int v3 = i7 < 4 ? 3 : i7 & 3; // 16-bit permutes if (r0!=0 || r1!=1 || r2!=2 || r3!=3) { // 16 bit permute of low half t2 = _mm_shufflelo_epi16(t1, r0 | r1<<2 | r2<<4 | r3<<6); } else t2 = t1; if (u0!=0 || u1!=1 || u2!=2 || u3!=3) { // 16 bit permute of low half t3 = _mm_shufflelo_epi16(a, u0 | u1<<2 | u2<<4 | u3<<6); } else t3 = a; if (s0!=0 || s1!=1 || s2!=2 || s3!=3) { // 16 bit permute of low half t4 = _mm_shufflehi_epi16(t2, s0 | s1<<2 | s2<<4 | s3<<6); } else t4 = t2; if (v0!=0 || v1!=1 || v2!=2 || v3!=3) { // 16 bit permute of low half t5 = _mm_shufflehi_epi16(t3, v0 | v1<<2 | v2<<4 | v3<<6); } else t5 = t3; // merge data from t4 and t5 t6 = constant4i < ((i0 & 4) ? 0xFFFF : 0) | ((i1 & 4) ? 0xFFFF0000 : 0), ((i2 & 4) ? 0xFFFF : 0) | ((i3 & 4) ? 0xFFFF0000 : 0), ((i4 & 4) ? 0 : 0xFFFF) | ((i5 & 4) ? 0 : 0xFFFF0000), ((i6 & 4) ? 0 : 0xFFFF) | ((i7 & 4) ? 0 : 0xFFFF0000) > (); t7 = selectb(t6,t4,t5); // select between permuted data t4 and t5 } } // Set any elements to zero if required if (m2 != -1 && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80)) { // some elements need to be set to 0 __m128i mask = constant4i < (i0 < 0 ? 0xFFFF0000 : -1) & (i1 < 0 ? 0x0000FFFF : -1), (i2 < 0 ? 0xFFFF0000 : -1) & (i3 < 0 ? 0x0000FFFF : -1), (i4 < 0 ? 0xFFFF0000 : -1) & (i5 < 0 ? 0x0000FFFF : -1), (i6 < 0 ? 0xFFFF0000 : -1) & (i7 < 0 ? 
0x0000FFFF : -1) > (); return _mm_and_si128(t7,mask); } else { return t7; } #endif } template static inline Vec8us permute8us(Vec8us const & a) { return Vec8us (permute8s (a)); } template static inline Vec16c permute16c(Vec16c const & a) { __m128i temp; // Combine all even indexes into a single bitfield, with 4 bits for each const uint32_t me = (i0&15) | (i2&15)<<4 | (i4&15)<<8 | (i6&15)<<12 | (i8&15)<<16 | (i10&15)<<20 | (i12&15)<<24 | (i14&15)<<28; // Combine all odd indexes into a single bitfield, with 4 bits for each const uint32_t mo = (i1&15) | (i3&15)<<4 | (i5&15)<<8 | (i7&15)<<12 | (i9&15)<<16 | (i11&15)<<20 | (i13&15)<<24 | (i15&15)<<28; // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative const uint32_t se = (i0<0?0:0xF) | (i2<0?0:0xF)<<4 | (i4<0?0:0xF)<<8 | (i6<0?0:0xF)<<12 | (i8<0?0:0xF)<<16 | (i10<0?0:0xF)<<20 | (i12<0?0:0xF)<<24 | (i14<0?0:0xF)<<28; // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative const uint32_t so = (i1<0?0:0xF) | (i3<0?0:0xF)<<4 | (i5<0?0:0xF)<<8 | (i7<0?0:0xF)<<12 | (i9<0?0:0xF)<<16 | (i11<0?0:0xF)<<20 | (i13<0?0:0xF)<<24 | (i15<0?0:0xF)<<28; // Mask indicating sign of all indexes, with 2 bits for each, 0 for negative (means set to zero or don't care), 0x3 for non-negative const uint32_t ss = (se & 0x33333333) | (so & 0xCCCCCCCC); // Mask indicating required zeroing of all indexes, with 2 bits for each, 0 for index = -1, 3 for index >= 0 or -256 const uint32_t ssz = ((i0&0x80)?0:3) | ((i1 &0x80)?0:3)<< 2 | ((i2 &0x80)?0:3)<< 4 | ((i3 &0x80)?0:3)<< 6 | ((i4 &0x80)?0:3)<< 8 | ((i5 &0x80)?0:3)<<10 | ((i6 &0x80)?0:3)<<12 | ((i7 &0x80)?0:3)<<14 | ((i8 &0x80)?0:3)<<16 | ((i9 &0x80)?0:3)<<18 | ((i10&0x80)?0:3)<<20 | ((i11&0x80)?0:3)<<22 | ((i12&0x80)?0:3)<<24 | ((i13&0x80)?0:3)<<26 | ((i14&0x80)?0:3)<<28 | ((i15&0x80)?0:3)<<30 ; // These indexes are used only to avoid bogus compiler warnings in false branches const int I0 = i0 > 0 ? (i0 & 0xF) : 0; const int I15 = i15 > 0 ? (i15 & 0xF) : 0; // special case: all zero if (ss == 0) { return _mm_setzero_si128(); } // remember if extra zeroing is needed bool do_and_zero = (ssz != 0xFFFFFFFFu); // check for special shortcut cases int shortcut = 0; // check if any permutation if (((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0) { shortcut = 1; } // check if we can use punpcklbw else if (((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0) { shortcut = 2; } // check if we can use punpckhbw else if (((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0) { shortcut = 3; } #if defined (_MSC_VER) && ! 
defined(__INTEL_COMPILER) #pragma warning(disable: 4307) // disable MS warning C4307: '+' : integral constant overflow #endif // check if we can use byte shift right else if (i0 > 0 && ((me ^ (uint32_t(I0)*0x11111111u + 0xECA86420u)) & se) == 0 && ((mo ^ (uint32_t(I0)*0x11111111u + 0xFDB97531u)) & so) == 0) { shortcut = 4; do_and_zero = ((0xFFFFFFFFu >> 2*I0) & ~ ssz) != 0; } // check if we can use byte shift left else if (i15 >= 0 && i15 < 15 && ((mo ^ (uint32_t(I15*0x11111111u) - (0x02468ACEu & so))) & so) == 0 && ((me ^ (uint32_t(I15*0x11111111u) - (0x13579BDFu & se))) & se) == 0) { shortcut = 5; do_and_zero = ((0xFFFFFFFFu << 2*(15-I15)) & ~ ssz) != 0; } #if INSTRSET >= 4 // SSSE3 (PSHUFB available only under SSSE3) // special case: rotate if (i0>0 && i0 < 16 && i1==((i0+1)&15) && i2 ==((i0+2 )&15) && i3 ==((i0+3 )&15) && i4 ==((i0+4 )&15) && i5 ==((i0+5 )&15) && i6 ==((i0+6 )&15) && i7 ==((i0+7 )&15) && i8==((i0+8)&15) && i9==((i0+9)&15) && i10==((i0+10)&15) && i11==((i0+11)&15) && i12==((i0+12)&15) && i13==((i0+13)&15) && i14==((i0+14)&15) && i15==((i0+15)&15)) { temp = _mm_alignr_epi8(a, a, i0 & 15); shortcut = -1; } if (shortcut == 0 || do_and_zero) { // general case: use PSHUFB __m128i mask = constant4i< (i0 & 0xFF) | (i1 & 0xFF) << 8 | (i2 & 0xFF) << 16 | (i3 & 0xFF) << 24 , (i4 & 0xFF) | (i5 & 0xFF) << 8 | (i6 & 0xFF) << 16 | (i7 & 0xFF) << 24 , (i8 & 0xFF) | (i9 & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24 , (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24 > (); temp = _mm_shuffle_epi8(a,mask); shortcut = -1; do_and_zero = false; } #endif // Check if we can use 16-bit permute. Even numbered indexes must be even and odd numbered // indexes must be equal to the preceding index + 1, except for negative indexes. if (shortcut == 0 && (me & 0x11111111 & se) == 0 && ((mo ^ 0x11111111) & 0x11111111 & so) == 0 && ((me ^ mo) & 0xEEEEEEEE & se & so) == 0) { temp = permute8s < i0 >= 0 ? i0 /2 : i1 >= 0 ? i1 /2 : (i0 | i1 ), i2 >= 0 ? i2 /2 : i3 >= 0 ? i3 /2 : (i2 | i3 ), i4 >= 0 ? i4 /2 : i5 >= 0 ? i5 /2 : (i4 | i5 ), i6 >= 0 ? i6 /2 : i7 >= 0 ? i7 /2 : (i6 | i7 ), i8 >= 0 ? i8 /2 : i9 >= 0 ? i9 /2 : (i8 | i9 ), i10 >= 0 ? i10/2 : i11 >= 0 ? i11/2 : (i10 | i11), i12 >= 0 ? i12/2 : i13 >= 0 ? i13/2 : (i12 | i13), i14 >= 0 ? i14/2 : i15 >= 0 ? i15/2 : (i14 | i15) > (Vec8s(a)); shortcut = 100; do_and_zero = (se != so && ssz != 0xFFFFFFFFu); } // Check if we can use 16-bit permute with bytes swapped. Even numbered indexes must be odd and odd // numbered indexes must be equal to the preceding index - 1, except for negative indexes. // (this case occurs when reversing byte order) if (shortcut == 0 && ((me ^ 0x11111111) & 0x11111111 & se) == 0 && (mo & 0x11111111 & so) == 0 && ((me ^ mo) & 0xEEEEEEEE & se & so) == 0) { Vec16c swapped = Vec16c(rotate_left(Vec8s(a), 8)); // swap odd and even bytes temp = permute8s < i0 >= 0 ? i0 /2 : i1 >= 0 ? i1 /2 : (i0 | i1 ), i2 >= 0 ? i2 /2 : i3 >= 0 ? i3 /2 : (i2 | i3 ), i4 >= 0 ? i4 /2 : i5 >= 0 ? i5 /2 : (i4 | i5 ), i6 >= 0 ? i6 /2 : i7 >= 0 ? i7 /2 : (i6 | i7 ), i8 >= 0 ? i8 /2 : i9 >= 0 ? i9 /2 : (i8 | i9 ), i10 >= 0 ? i10/2 : i11 >= 0 ? i11/2 : (i10 | i11), i12 >= 0 ? i12/2 : i13 >= 0 ? i13/2 : (i12 | i13), i14 >= 0 ? i14/2 : i15 >= 0 ? 
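/* Worked illustration of the index bitfields (hypothetical call, byte reversal):
   permute16c<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0>(a) packs its even indexes
   into me = 0x13579BDF and its odd indexes into mo = 0x02468ACE. Under SSSE3
   this call is handled by the general PSHUFB case; on plain SSE2 the
   "bytes swapped" test above should match instead, so the reversal becomes a
   byte swap (rotate_left of Vec8s by 8) followed by permute8s<7,6,5,4,3,2,1,0>. */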
i15/2 : (i14 | i15) > (Vec8s(swapped)); shortcut = 101; do_and_zero = (se != so && ssz != 0xFFFFFFFFu); } // all shortcuts end here if (shortcut) { switch (shortcut) { case 1: temp = a; break; case 2: temp = _mm_unpacklo_epi8(a,a); break; case 3: temp = _mm_unpackhi_epi8(a,a); break; case 4: temp = _mm_srli_si128(a, I0); break; case 5: temp = _mm_slli_si128(a, 15-I15); break; default: break; // result is already in temp } if (do_and_zero) { // additional zeroing needed __m128i maskz = constant4i < (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); temp = _mm_and_si128(temp, maskz); } return temp; } // complicated cases: use 16-bit permute up to four times const bool e2e = (~me & 0x11111111 & se) != 0; // even bytes of source to even bytes of destination const bool e2o = (~mo & 0x11111111 & so) != 0; // even bytes of source to odd bytes of destination const bool o2e = (me & 0x11111111 & se) != 0; // odd bytes of source to even bytes of destination const bool o2o = (mo & 0x11111111 & so) != 0; // odd bytes of source to odd bytes of destination Vec16c swapped, te2e, te2o, to2e, to2o, combeven, combodd; if (e2o || o2e) swapped = rotate_left(Vec8s(a), 8); // swap odd and even bytes // even-to-even bytes if (e2e) te2e = permute8s <(i0&1)?-1:i0/2, (i2&1)?-1:i2/2, (i4&1)?-1:i4/2, (i6&1)?-1:i6/2, (i8&1)?-1:i8/2, (i10&1)?-1:i10/2, (i12&1)?-1:i12/2, (i14&1)?-1:i14/2> (Vec8s(a)); // odd-to-even bytes if (o2e) to2e = permute8s <(i0&1)?i0/2:-1, (i2&1)?i2/2:-1, (i4&1)?i4/2:-1, (i6&1)?i6/2:-1, (i8&1)?i8/2:-1, (i10&1)?i10/2:-1, (i12&1)?i12/2:-1, (i14&1)?i14/2:-1> (Vec8s(swapped)); // even-to-odd bytes if (e2o) te2o = permute8s <(i1&1)?-1:i1/2, (i3&1)?-1:i3/2, (i5&1)?-1:i5/2, (i7&1)?-1:i7/2, (i9&1)?-1:i9/2, (i11&1)?-1:i11/2, (i13&1)?-1:i13/2, (i15&1)?-1:i15/2> (Vec8s(swapped)); // odd-to-odd bytes if (o2o) to2o = permute8s <(i1&1)?i1/2:-1, (i3&1)?i3/2:-1, (i5&1)?i5/2:-1, (i7&1)?i7/2:-1, (i9&1)?i9/2:-1, (i11&1)?i11/2:-1, (i13&1)?i13/2:-1, (i15&1)?i15/2:-1> (Vec8s(a)); if (e2e && o2e) combeven = te2e | to2e; else if (e2e) combeven = te2e; else if (o2e) combeven = to2e; else combeven = _mm_setzero_si128(); if (e2o && o2o) combodd = te2o | to2o; else if (e2o) combodd = te2o; else if (o2o) combodd = to2o; else combodd = _mm_setzero_si128(); __m128i maske = constant4i < // mask used even bytes (i0 < 0 ? 0 : 0xFF) | (i2 < 0 ? 0 : 0xFF0000), (i4 < 0 ? 0 : 0xFF) | (i6 < 0 ? 0 : 0xFF0000), (i8 < 0 ? 0 : 0xFF) | (i10 < 0 ? 0 : 0xFF0000), (i12 < 0 ? 0 : 0xFF) | (i14 < 0 ? 0 : 0xFF0000) > (); __m128i masko = constant4i < // mask used odd bytes (i1 < 0 ? 0 : 0xFF00) | (i3 < 0 ? 0 : 0xFF000000), (i5 < 0 ? 0 : 0xFF00) | (i7 < 0 ? 0 : 0xFF000000), (i9 < 0 ? 0 : 0xFF00) | (i11 < 0 ? 0 : 0xFF000000), (i13 < 0 ? 0 : 0xFF00) | (i15 < 0 ? 
0 : 0xFF000000) > (); return _mm_or_si128( // combine even and odd bytes _mm_and_si128(combeven, maske), _mm_and_si128(combodd, masko)); } template static inline Vec16uc permute16uc(Vec16uc const & a) { return Vec16uc (permute16c (a)); } /***************************************************************************** * * Vector blend functions * ****************************************************************************** * * These blend functions can mix elements from two different vectors and * optionally set some elements to zero. * * The indexes are inserted as template parameters in <>. These indexes must be * constants. Each template parameter is an index to the element you want to * select, where higher indexes indicate an element from the second source * vector. For example, if each vector has 4 elements, then indexes 0 - 3 * will select an element from the first vector and indexes 4 - 7 will select * an element from the second vector. A negative index will generate zero. * * The blend functions for vectors of 8-bit integers are inefficient if * the SSSE3 instruction set or later is not enabled. * * Example: * Vec4i a(100,101,102,103); // a is (100, 101, 102, 103) * Vec4i b(200,201,202,203); // b is (200, 201, 202, 203) * Vec4i c; * c = blend4i<1,4,-1,7> (a,b); // c is (101, 200, 0, 203) * * A lot of the code here is metaprogramming aiming to find the instructions * that best fit the template parameters and instruction set. The metacode * will be reduced out to leave only a few vector instructions in release * mode with optimization on. *****************************************************************************/ template static inline Vec16c blend16c(Vec16c const & a, Vec16c const & b) { // Combine bit 0-3 of all even indexes into a single bitfield, with 4 bits for each const int me = (i0&15) | (i2&15)<<4 | (i4&15)<<8 | (i6&15)<<12 | (i8&15)<<16 | (i10&15)<<20 | (i12&15)<<24 | (i14&15)<<28; // Combine bit 0-3 of all odd indexes into a single bitfield, with 4 bits for each const int mo = (i1&15) | (i3&15)<<4 | (i5&15)<<8 | (i7&15)<<12 | (i9&15)<<16 | (i11&15)<<20 | (i13&15)<<24 | (i15&15)<<28; // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative const int se = (i0<0?0:0xF) | (i2<0?0:0xF)<<4 | (i4<0?0:0xF)<<8 | (i6<0?0:0xF)<<12 | (i8<0?0:0xF)<<16 | (i10<0?0:0xF)<<20 | (i12<0?0:0xF)<<24 | (i14<0?0:0xF)<<28; // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative const int so = (i1<0?0:0xF) | (i3<0?0:0xF)<<4 | (i5<0?0:0xF)<<8 | (i7<0?0:0xF)<<12 | (i9<0?0:0xF)<<16 | (i11<0?0:0xF)<<20 | (i13<0?0:0xF)<<24 | (i15<0?0:0xF)<<28; // Combine bit 4 of all even indexes into a single bitfield, with 4 bits for each const int ne = (i0&16)>>4 | (i2&16) | (i4&16)<<4 | (i6&16)<<8 | (i8&16)<<12 | (i10&16)<<16 | (i12&16)<<20 | (i14&16)<<24; // Combine bit 4 of all odd indexes into a single bitfield, with 4 bits for each const int no = (i1&16)>>4 | (i3&16) | (i5&16)<<4 | (i7&16)<<8 | (i9&16)<<12 | (i11&16)<<16 | (i13&16)<<20 | (i15&16)<<24; // Check if zeroing needed const bool do_zero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0; // needs zeroing // no elements from b if (((ne & se) | (no & so)) == 0) { return permute16c (a); } // no elements from a if ((((ne^0x11111111) & se) | ((no^0x11111111) & so)) == 0) { return permute16c (b); } __m128i t; // check if we can use punpcklbw if (((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0) { if ((ne & se) == 0 && ((no ^ 
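/* A common use of the blend functions is to read a window that straddles two
   consecutive vectors (hypothetical values):
   Vec8s a(0,1,2,3,4,5,6,7), b(8,9,10,11,12,13,14,15);
   Vec8s w = blend8s<4,5,6,7,8,9,10,11>(a,b);   // w = (4,5,6,7,8,9,10,11)
   This index pattern matches the "shift left" special case in blend8s below and
   should compile to a single PALIGNR when SSSE3 is enabled. */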
0x11111111) & so) == 0) { t = _mm_unpacklo_epi8(a,b); } if ((no & so) == 0 && ((ne ^ 0x11111111) & se) == 0) { t = _mm_unpacklo_epi8(b,a); } if (do_zero) { // additional zeroing needed __m128i maskz = constant4i < (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); t = _mm_and_si128(t, maskz); } return t; } // check if we can use punpckhbw if (((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0) { if ((ne & se) == 0 && ((no ^ 0x11111111) & so) == 0) { t = _mm_unpackhi_epi8(a,b); } if ((no & so) == 0 && ((ne ^ 0x11111111) & se) == 0) { t = _mm_unpackhi_epi8(b,a); } if (do_zero) { // additional zeroing needed __m128i maskz = constant4i < (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); t = _mm_and_si128(t, maskz); } return t; } #if INSTRSET >= 4 // SSSE3 // special case: shift left if (i0 > 0 && i0 < 16 && i1==i0+1 && i2==i0+2 && i3==i0+3 && i4==i0+4 && i5==i0+5 && i6==i0+6 && i7==i0+7 && i8==i0+8 && i9==i0+9 && i10==i0+10 && i11==i0+11 && i12==i0+12 && i13==i0+13 && i14==i0+14 && i15==i0+15) { return _mm_alignr_epi8(b, a, (i0 & 15)); } // special case: shift right if (i0 > 15 && i0 < 32 && i1==((i0+1)&31) && i2 ==((i0+2 )&31) && i3 ==((i0+3 )&31) && i4 ==((i0+4 )&31) && i5 ==((i0+5 )&31) && i6 ==((i0+6 )&31) && i7 ==((i0+7 )&31) && i8==((i0+8 )&31) && i9==((i0+9)&31) && i10==((i0+10)&31) && i11==((i0+11)&31) && i12==((i0+12)&31) && i13==((i0+13)&31) && i14==((i0+14)&31) && i15==((i0+15)&31)) { return _mm_alignr_epi8(a, b, (i0 & 15)); } #endif #if INSTRSET >= 5 // SSE4.1 supported // special case: blend without permute if (((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0) { __m128i maskbl = constant4i< ((i0 & 16) ? 0xFF : 0) | ((i1 & 16) ? 0xFF00 : 0) | ((i2 & 16) ? 0xFF0000 : 0) | ((i3 & 16) ? 0xFF000000 : 0) , ((i4 & 16) ? 0xFF : 0) | ((i5 & 16) ? 0xFF00 : 0) | ((i6 & 16) ? 0xFF0000 : 0) | ((i7 & 16) ? 0xFF000000 : 0) , ((i8 & 16) ? 0xFF : 0) | ((i9 & 16) ? 0xFF00 : 0) | ((i10& 16) ? 0xFF0000 : 0) | ((i11& 16) ? 0xFF000000 : 0) , ((i12& 16) ? 0xFF : 0) | ((i13& 16) ? 0xFF00 : 0) | ((i14& 16) ? 0xFF0000 : 0) | ((i15& 16) ? 0xFF000000 : 0) > (); t = _mm_blendv_epi8(a, b, maskbl); if (do_zero) { // additional zeroing needed __m128i maskz = constant4i < (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); t = _mm_and_si128(t, maskz); } return t; } #endif // SSE4.1 #if defined ( __XOP__ ) // Use AMD XOP instruction VPPERM __m128i mask = constant4i< (i0 <0 ? 0x80 : (i0 &31)) | (i1 <0 ? 
0x80 : (i1 &31)) << 8 | (i2 <0 ? 0x80 : (i2 &31)) << 16 | (i3 <0 ? 0x80 : (i3 &31)) << 24, (i4 <0 ? 0x80 : (i4 &31)) | (i5 <0 ? 0x80 : (i5 &31)) << 8 | (i6 <0 ? 0x80 : (i6 &31)) << 16 | (i7 <0 ? 0x80 : (i7 &31)) << 24, (i8 <0 ? 0x80 : (i8 &31)) | (i9 <0 ? 0x80 : (i9 &31)) << 8 | (i10<0 ? 0x80 : (i10&31)) << 16 | (i11<0 ? 0x80 : (i11&31)) << 24, (i12<0 ? 0x80 : (i12&31)) | (i13<0 ? 0x80 : (i13&31)) << 8 | (i14<0 ? 0x80 : (i14&31)) << 16 | (i15<0 ? 0x80 : (i15&31)) << 24 > (); return _mm_perm_epi8(a, b, mask); #elif INSTRSET >= 4 // SSSE3 // general case. Use PSHUFB __m128i maska = constant4i< ((i0 & 0x90) ? 0xFF : (i0 &15)) | ((i1 & 0x90) ? 0xFF : (i1 &15)) << 8 | ((i2 & 0x90) ? 0xFF : (i2 &15)) << 16 | ((i3 & 0x90) ? 0xFF : (i3 &15)) << 24, ((i4 & 0x90) ? 0xFF : (i4 &15)) | ((i5 & 0x90) ? 0xFF : (i5 &15)) << 8 | ((i6 & 0x90) ? 0xFF : (i6 &15)) << 16 | ((i7 & 0x90) ? 0xFF : (i7 &15)) << 24, ((i8 & 0x90) ? 0xFF : (i8 &15)) | ((i9 & 0x90) ? 0xFF : (i9 &15)) << 8 | ((i10& 0x90) ? 0xFF : (i10&15)) << 16 | ((i11& 0x90) ? 0xFF : (i11&15)) << 24, ((i12& 0x90) ? 0xFF : (i12&15)) | ((i13& 0x90) ? 0xFF : (i13&15)) << 8 | ((i14& 0x90) ? 0xFF : (i14&15)) << 16 | ((i15& 0x90) ? 0xFF : (i15&15)) << 24 > (); __m128i maskb = constant4i< (((i0^0x10) & 0x90) ? 0xFF : (i0 &15)) | (((i1^0x10) & 0x90) ? 0xFF : (i1 &15)) << 8 | (((i2^0x10) & 0x90) ? 0xFF : (i2 &15)) << 16 | (((i3^0x10) & 0x90) ? 0xFF : (i3 &15)) << 24, (((i4^0x10) & 0x90) ? 0xFF : (i4 &15)) | (((i5^0x10) & 0x90) ? 0xFF : (i5 &15)) << 8 | (((i6^0x10) & 0x90) ? 0xFF : (i6 &15)) << 16 | (((i7^0x10) & 0x90) ? 0xFF : (i7 &15)) << 24, (((i8^0x10) & 0x90) ? 0xFF : (i8 &15)) | (((i9^0x10) & 0x90) ? 0xFF : (i9 &15)) << 8 | (((i10^0x10)& 0x90) ? 0xFF : (i10&15)) << 16 | (((i11^0x10)& 0x90) ? 0xFF : (i11&15)) << 24, (((i12^0x10)& 0x90) ? 0xFF : (i12&15)) | (((i13^0x10)& 0x90) ? 0xFF : (i13&15)) << 8 | (((i14^0x10)& 0x90) ? 0xFF : (i14&15)) << 16 | (((i15^0x10)& 0x90) ? 0xFF : (i15&15)) << 24 > (); __m128i a1 = _mm_shuffle_epi8(a,maska); __m128i b1 = _mm_shuffle_epi8(b,maskb); return _mm_or_si128(a1,b1); #else // SSE2 // combine two permutes __m128i a1 = permute16c < (uint32_t)i0 < 16 ? i0 : -1, (uint32_t)i1 < 16 ? i1 : -1, (uint32_t)i2 < 16 ? i2 : -1, (uint32_t)i3 < 16 ? i3 : -1, (uint32_t)i4 < 16 ? i4 : -1, (uint32_t)i5 < 16 ? i5 : -1, (uint32_t)i6 < 16 ? i6 : -1, (uint32_t)i7 < 16 ? i7 : -1, (uint32_t)i8 < 16 ? i8 : -1, (uint32_t)i9 < 16 ? i9 : -1, (uint32_t)i10 < 16 ? i10 : -1, (uint32_t)i11 < 16 ? i11 : -1, (uint32_t)i12 < 16 ? i12 : -1, (uint32_t)i13 < 16 ? i13 : -1, (uint32_t)i14 < 16 ? i14 : -1, (uint32_t)i15 < 16 ? i15 : -1 > (a); __m128i b1 = permute16c < (uint32_t)(i0 ^16) < 16 ? (i0 ^16) : -1, (uint32_t)(i1 ^16) < 16 ? (i1 ^16) : -1, (uint32_t)(i2 ^16) < 16 ? (i2 ^16) : -1, (uint32_t)(i3 ^16) < 16 ? (i3 ^16) : -1, (uint32_t)(i4 ^16) < 16 ? (i4 ^16) : -1, (uint32_t)(i5 ^16) < 16 ? (i5 ^16) : -1, (uint32_t)(i6 ^16) < 16 ? (i6 ^16) : -1, (uint32_t)(i7 ^16) < 16 ? (i7 ^16) : -1, (uint32_t)(i8 ^16) < 16 ? (i8 ^16) : -1, (uint32_t)(i9 ^16) < 16 ? (i9 ^16) : -1, (uint32_t)(i10^16) < 16 ? (i10^16) : -1, (uint32_t)(i11^16) < 16 ? (i11^16) : -1, (uint32_t)(i12^16) < 16 ? (i12^16) : -1, (uint32_t)(i13^16) < 16 ? (i13^16) : -1, (uint32_t)(i14^16) < 16 ? (i14^16) : -1, (uint32_t)(i15^16) < 16 ? 
(i15^16) : -1 > (b); return _mm_or_si128(a1,b1); #endif } template static inline Vec16uc blend16uc(Vec16uc const & a, Vec16uc const & b) { return Vec16uc( blend16c (a,b)); } template static inline Vec8s blend8s(Vec8s const & a, Vec8s const & b) { // Combine all the indexes into a single bitfield, with 4 bits for each const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; // Mask to zero out negative indexes const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; // Some elements must be set to zero const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0; // temp contains temporary result, some zeroing needs to be done bool zeroing_pending = false; // partially finished result __m128i temp; if ((m1 & 0x88888888 & mz) == 0) { // no elements from b return permute8s (a); } if (((m1^0x88888888) & 0x88888888 & mz) == 0) { // no elements from a return permute8s (b); } // special case: PUNPCKLWD if (((m1 ^ 0xB3A29180) & mz) == 0) { temp = _mm_unpacklo_epi16(a, b); if (do_zero) zeroing_pending = true; else return temp; } if (((m1 ^ 0x3B2A1908) & mz) == 0) { temp = _mm_unpacklo_epi16(b, a); if (do_zero) zeroing_pending = true; else return temp; } // special case: PUNPCKHWD if (((m1 ^ 0xF7E6D5C4) & mz) == 0) { temp = _mm_unpackhi_epi16(a, b); if (do_zero) zeroing_pending = true; else return temp; } if (((m1 ^ 0x7F6E5D4C) & mz) == 0) { temp = _mm_unpackhi_epi16(b, a); if (do_zero) zeroing_pending = true; else return temp; } #if INSTRSET >= 4 // SSSE3 // special case: shift left if (i0 > 0 && i0 < 8 && ((m1 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0) { temp = _mm_alignr_epi8(b, a, (i0 & 7) * 2); if (do_zero) zeroing_pending = true; else return temp; } // special case: shift right if (i0 > 8 && i0 < 16 && ((m1 ^ 0x88888888 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0) { temp = _mm_alignr_epi8(a, b, (i0 & 7) * 2); if (do_zero) zeroing_pending = true; else return temp; } #endif // SSSE3 #if INSTRSET >= 5 // SSE4.1 supported // special case: blending without permuting if ((((m1 & ~0x88888888) ^ 0x76543210) & mz) == 0) { temp = _mm_blend_epi16(a, b, (i0>>3&1) | (i1>>3&1)<<1 | (i2>>3&1)<<2 | (i3>>3&1)<<3 | (i4>>3&1)<<4 | (i5>>3&1)<<5 | (i6>>3&1)<<6 | (i7>>3&1)<<7); if (do_zero) zeroing_pending = true; else return temp; } #endif // SSE4.1 if (zeroing_pending) { // additional zeroing of temp needed __m128i maskz = constant4i < (i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000) , (i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000) , (i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000) , (i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000) > (); return _mm_and_si128(temp, maskz); } // general case #ifdef __XOP__ // Use AMD XOP instruction PPERM __m128i mask = constant4i < (i0 < 0 ? 0x8080 : (i0*2 & 31) | ((i0*2 & 31)+1)<<8) | (i1 < 0 ? 0x80800000 : ((i1*2 & 31)<<16) | ((i1*2 & 31)+1)<<24), (i2 < 0 ? 0x8080 : (i2*2 & 31) | ((i2*2 & 31)+1)<<8) | (i3 < 0 ? 0x80800000 : ((i3*2 & 31)<<16) | ((i3*2 & 31)+1)<<24), (i4 < 0 ? 0x8080 : (i4*2 & 31) | ((i4*2 & 31)+1)<<8) | (i5 < 0 ? 0x80800000 : ((i5*2 & 31)<<16) | ((i5*2 & 31)+1)<<24), (i6 < 0 ? 0x8080 : (i6*2 & 31) | ((i6*2 & 31)+1)<<8) | (i7 < 0 ? 0x80800000 : ((i7*2 & 31)<<16) | ((i7*2 & 31)+1)<<24) > (); return _mm_perm_epi8(a, b, mask); #else // combine two permutes __m128i a1 = permute8s < (uint32_t)i0 < 8 ? i0 : -1, (uint32_t)i1 < 8 ? 
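/* Example of "blending without permuting" (hypothetical values): with
   Vec8s a(0,1,2,3,4,5,6,7) and Vec8s b(10,11,12,13,14,15,16,17),
   blend8s<0,9,2,11,4,13,6,15>(a,b) returns (0,11,2,13,4,15,6,17).
   Every element keeps its own position, so the SSE4.1 branch above should
   reduce this to a single PBLENDW with immediate mask 0xAA. */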
i1 : -1, (uint32_t)i2 < 8 ? i2 : -1, (uint32_t)i3 < 8 ? i3 : -1, (uint32_t)i4 < 8 ? i4 : -1, (uint32_t)i5 < 8 ? i5 : -1, (uint32_t)i6 < 8 ? i6 : -1, (uint32_t)i7 < 8 ? i7 : -1 > (a); __m128i b1 = permute8s < (uint32_t)(i0^8) < 8 ? (i0^8) : -1, (uint32_t)(i1^8) < 8 ? (i1^8) : -1, (uint32_t)(i2^8) < 8 ? (i2^8) : -1, (uint32_t)(i3^8) < 8 ? (i3^8) : -1, (uint32_t)(i4^8) < 8 ? (i4^8) : -1, (uint32_t)(i5^8) < 8 ? (i5^8) : -1, (uint32_t)(i6^8) < 8 ? (i6^8) : -1, (uint32_t)(i7^8) < 8 ? (i7^8) : -1 > (b); return _mm_or_si128(a1,b1); #endif } template static inline Vec8us blend8us(Vec8us const & a, Vec8us const & b) { return Vec8us(blend8s (a,b)); } template static inline Vec4i blend4i(Vec4i const & a, Vec4i const & b) { // Combine all the indexes into a single bitfield, with 8 bits for each const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24; // Mask to zero out negative indexes const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24; // Some elements must be set to zero const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3) & 0x80) != 0; // temp contains temporary result, some zeroing needs to be done bool zeroing_pending = false; // partially finished result __m128i temp; #if defined (_MSC_VER) || defined (__clang__) temp = a; // avoid spurious warning message for temp unused #endif // special case: no elements from b if ((m1 & 0x04040404 & mz) == 0) { return permute4i(a); } // special case: no elements from a if (((m1^0x04040404) & 0x04040404 & mz) == 0) { return permute4i(b); } // special case: PUNPCKLDQ if (((m1 ^ 0x05010400) & mz) == 0) { temp = _mm_unpacklo_epi32(a, b); if (do_zero) zeroing_pending = true; else return temp; } if (((m1 ^ 0x01050004) & mz) == 0) { temp = _mm_unpacklo_epi32(b, a); if (do_zero) zeroing_pending = true; else return temp; } // special case: PUNPCKHDQ if (((m1 ^ 0x07030602) & mz) == 0) { temp = _mm_unpackhi_epi32(a, b); if (do_zero) zeroing_pending = true; else return temp; } if (((m1 ^ 0x03070206) & mz) == 0) { temp = _mm_unpackhi_epi32(b, a); if (do_zero) zeroing_pending = true; else return temp; } #if INSTRSET >= 4 // SSSE3 // special case: shift left if (i0 > 0 && i0 < 4 && ((m1 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0) { temp = _mm_alignr_epi8(b, a, (i0 & 3) * 4); if (do_zero) zeroing_pending = true; else return temp; } // special case: shift right if (i0 > 4 && i0 < 8 && ((m1 ^ 0x04040404 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0) { temp = _mm_alignr_epi8(a, b, (i0 & 3) * 4); if (do_zero) zeroing_pending = true; else return temp; } #endif // SSSE3 #if INSTRSET >= 5 // SSE4.1 supported if ((((m1 & ~0x04040404) ^ 0x03020100) & mz) == 0) { // blending without permuting temp = _mm_blend_epi16(a, b, ((i0>>2)&1)*3 | ((((i1>>2)&1)*3)<<2) | ((((i2>>2)&1)*3)<<4) | ((((i3>>2)&1)*3)<<6)); if (do_zero) zeroing_pending = true; else return temp; } #endif // SSE4.1 if (zeroing_pending) { // additional zeroing of temp needed __m128i maskz = constant4i < (i0 < 0 ? 0 : -1), (i1 < 0 ? 0 : -1), (i2 < 0 ? 0 : -1), (i3 < 0 ? 0 : -1) > (); return _mm_and_si128(temp, maskz); } // general case #ifdef __XOP__ // Use AMD XOP instruction PPERM __m128i mask = constant4i < i0 < 0 ? 0x80808080 : (i0*4 & 31) + (((i0*4 & 31) + 1) << 8) + (((i0*4 & 31) + 2) << 16) + (((i0*4 & 31) + 3) << 24), i1 < 0 ? 0x80808080 : (i1*4 & 31) + (((i1*4 & 31) + 1) << 8) + (((i1*4 & 31) + 2) << 16) + (((i1*4 & 31) + 3) << 24), i2 < 0 ? 
0x80808080 : (i2*4 & 31) + (((i2*4 & 31) + 1) << 8) + (((i2*4 & 31) + 2) << 16) + (((i2*4 & 31) + 3) << 24), i3 < 0 ? 0x80808080 : (i3*4 & 31) + (((i3*4 & 31) + 1) << 8) + (((i3*4 & 31) + 2) << 16) + (((i3*4 & 31) + 3) << 24) > (); return _mm_perm_epi8(a, b, mask); #else // combine two permutes __m128i a1 = permute4i < (uint32_t)i0 < 4 ? i0 : -1, (uint32_t)i1 < 4 ? i1 : -1, (uint32_t)i2 < 4 ? i2 : -1, (uint32_t)i3 < 4 ? i3 : -1 > (a); __m128i b1 = permute4i < (uint32_t)(i0^4) < 4 ? (i0^4) : -1, (uint32_t)(i1^4) < 4 ? (i1^4) : -1, (uint32_t)(i2^4) < 4 ? (i2^4) : -1, (uint32_t)(i3^4) < 4 ? (i3^4) : -1 > (b); return _mm_or_si128(a1,b1); #endif } template static inline Vec4ui blend4ui(Vec4ui const & a, Vec4ui const & b) { return Vec4ui (blend4i (a,b)); } template static inline Vec2q blend2q(Vec2q const & a, Vec2q const & b) { // Combine all the indexes into a single bitfield, with 8 bits for each const int m1 = (i0&3) | (i1&3)<<8; // Mask to zero out negative indexes const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8; // no elements from b if ((m1 & 0x0202 & mz) == 0) { return permute2q (a); } // no elements from a if (((m1^0x0202) & 0x0202 & mz) == 0) { return permute2q (b); } // (all cases where one index is -1 or -256 would go to the above cases) // special case: PUNPCKLQDQ if (i0 == 0 && i1 == 2) { return _mm_unpacklo_epi64(a, b); } if (i0 == 2 && i1 == 0) { return _mm_unpacklo_epi64(b, a); } // special case: PUNPCKHQDQ if (i0 == 1 && i1 == 3) { return _mm_unpackhi_epi64(a, b); } if (i0 == 3 && i1 == 1) { return _mm_unpackhi_epi64(b, a); } #if INSTRSET >= 4 // SSSE3 // special case: shift left if (i0 == 1 && i1 == 2) { return _mm_alignr_epi8(b, a, 8); } // special case: shift right if (i0 == 3 && i1 == 0) { return _mm_alignr_epi8(a, b, 8); } #endif // SSSE3 #if INSTRSET >= 5 // SSE4.1 supported if (((m1 & ~0x0202) ^ 0x0100) == 0 && mz == 0xFFFF) { // blending without permuting return _mm_blend_epi16(a, b, (i0>>1 & 1) * 0xF | ((i1>>1 & 1) * 0xF) << 4 ); } #endif // SSE4.1 // general case. combine two permutes // (all cases are caught by the above special cases if SSE4.1 or higher is supported) __m128i a1, b1; a1 = permute2q <(uint32_t)i0 < 2 ? i0 : -1, (uint32_t)i1 < 2 ? i1 : -1 > (a); b1 = permute2q <(uint32_t)(i0^2) < 2 ? (i0^2) : -1, (uint32_t)(i1^2) < 2 ? (i1^2) : -1 > (b); return _mm_or_si128(a1,b1); } template static inline Vec2uq blend2uq(Vec2uq const & a, Vec2uq const & b) { return Vec2uq (blend2q ((__m128i)a, (__m128i)b)); } /***************************************************************************** * * Vector lookup functions * ****************************************************************************** * * These functions use vector elements as indexes into a table. * The table is given as one or more vectors or as an array. * * This can be used for several purposes: * - table lookup * - permute or blend with variable indexes * - blend from more than two sources * - gather non-contiguous data * * An index out of range may produce any value - the actual value produced is * implementation dependent and may be different for different instruction * sets. An index out of range does not produce an error message or exception. 
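 *
 * The table may also be given as an array in memory through the templated
 * lookup<n> functions, where n is the length of the table. A sketch (the
 * array and its contents are hypothetical, not part of this header):
 * int32_t tab[8] = {10,11,12,13,14,15,16,17};
 * Vec4i   idx(7,5,0,2);
 * Vec4i   r = lookup<8>(idx, tab);        // r is (17, 15, 10, 12)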
* * Example: * Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3) * Vec4i b(100,101,102,103); // table b is (100, 101, 102, 103) * Vec4i c; * c = lookup4 (a,b); // c is (102, 100, 100, 103) * *****************************************************************************/ static inline Vec16c lookup16(Vec16c const & index, Vec16c const & table) { #if INSTRSET >= 5 // SSSE3 return _mm_shuffle_epi8(table, index); #else uint8_t ii[16]; int8_t tt[16], rr[16]; table.store(tt); index.store(ii); for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x0F]; return Vec16c().load(rr); #endif } static inline Vec16c lookup32(Vec16c const & index, Vec16c const & table0, Vec16c const & table1) { #ifdef __XOP__ // AMD XOP instruction set. Use VPPERM return _mm_perm_epi8(table0, table1, index); #elif INSTRSET >= 5 // SSSE3 Vec16c r0 = _mm_shuffle_epi8(table0, index + 0x70); // make negative index for values >= 16 Vec16c r1 = _mm_shuffle_epi8(table1, (index ^ 0x10) + 0x70); // make negative index for values < 16 return r0 | r1; #else uint8_t ii[16]; int8_t tt[16], rr[16]; table0.store(tt); table1.store(tt+16); index.store(ii); for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x1F]; return Vec16c().load(rr); #endif } template static inline Vec16c lookup(Vec16c const & index, void const * table) { if (n <= 0) return 0; if (n <= 16) return lookup16(index, Vec16c().load(table)); if (n <= 32) return lookup32(index, Vec16c().load(table), Vec16c().load((int8_t*)table + 16)); // n > 32. Limit index Vec16uc index1; if ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec16uc(index) & uint8_t(n-1); } else { // n is not a power of 2, limit to n-1 index1 = min(Vec16uc(index), uint8_t(n-1)); } uint8_t ii[16]; index1.store(ii); int8_t rr[16]; for (int j = 0; j < 16; j++) { rr[j] = ((int8_t*)table)[ii[j]]; } return Vec16c().load(rr); } static inline Vec8s lookup8(Vec8s const & index, Vec8s const & table) { #if INSTRSET >= 5 // SSSE3 return _mm_shuffle_epi8(table, index * 0x202 + 0x100); #else int16_t ii[8], tt[8], rr[8]; table.store(tt); index.store(ii); for (int j = 0; j < 8; j++) rr[j] = tt[ii[j] & 0x07]; return Vec8s().load(rr); #endif } static inline Vec8s lookup16(Vec8s const & index, Vec8s const & table0, Vec8s const & table1) { #ifdef __XOP__ // AMD XOP instruction set. Use VPPERM return _mm_perm_epi8(table0, table1, index * 0x202 + 0x100); #elif INSTRSET >= 5 // SSSE3 Vec8s r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x202) + Vec16c(Vec8s(0x7170))); Vec8s r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x202 ^ 0x1010) + Vec16c(Vec8s(0x7170))); return r0 | r1; #else int16_t ii[16], tt[32], rr[16]; table0.store(tt); table1.store(tt+8); index.store(ii); for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x1F]; return Vec8s().load(rr); #endif } template static inline Vec8s lookup(Vec8s const & index, void const * table) { if (n <= 0) return 0; if (n <= 8) return lookup8 (index, Vec8s().load(table)); if (n <= 16) return lookup16(index, Vec8s().load(table), Vec8s().load((int16_t*)table + 8)); // n > 16. Limit index Vec8us index1; if ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8us(index) & (n-1); } else { // n is not a power of 2, limit to n-1 index1 = min(Vec8us(index), n-1); } #if INSTRSET >= 8 // AVX2. 
Use VPERMD Vec8s t1 = _mm_i32gather_epi32((const int *)table, __m128i((Vec4i(index1)) & (Vec4i(0x0000FFFF))), 2); // even positions Vec8s t2 = _mm_i32gather_epi32((const int *)table, _mm_srli_epi32(index1, 16) , 2); // odd positions return blend8s<0,8,2,10,4,12,6,14>(t1, t2); #else uint16_t ii[8]; index1.store(ii); return Vec8s(((int16_t*)table)[ii[0]], ((int16_t*)table)[ii[1]], ((int16_t*)table)[ii[2]], ((int16_t*)table)[ii[3]], ((int16_t*)table)[ii[4]], ((int16_t*)table)[ii[5]], ((int16_t*)table)[ii[6]], ((int16_t*)table)[ii[7]]); #endif } static inline Vec4i lookup4(Vec4i const & index, Vec4i const & table) { #if INSTRSET >= 5 // SSSE3 return _mm_shuffle_epi8(table, index * 0x04040404 + 0x03020100); #else return Vec4i(table[index[0]],table[index[1]],table[index[2]],table[index[3]]); #endif } static inline Vec4i lookup8(Vec4i const & index, Vec4i const & table0, Vec4i const & table1) { // return Vec4i(lookup16(Vec8s(index * 0x20002 + 0x10000), Vec8s(table0), Vec8s(table1))); #ifdef __XOP__ // AMD XOP instruction set. Use VPPERM return _mm_perm_epi8(table0, table1, index * 0x04040404 + 0x03020100); #elif INSTRSET >= 8 // AVX2. Use VPERMD __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector #if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) // bug in MS VS 11 beta: operands in wrong order return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01)); #elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) // Gcc 4.7.0 also has operands in wrong order return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01)); #else return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index))); #endif // bug #elif INSTRSET >= 4 // SSSE3 Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x04040404) + Vec16c(Vec4i(0x73727170))); Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x04040404 ^ 0x10101010) + Vec16c(Vec4i(0x73727170))); return r0 | r1; #else // SSE2 int32_t ii[4], tt[8], rr[4]; table0.store(tt); table1.store(tt+4); index.store(ii); for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x07]; return Vec4i().load(rr); #endif } static inline Vec4i lookup16(Vec4i const & index, Vec4i const & table0, Vec4i const & table1, Vec4i const & table2, Vec4i const & table3) { #if INSTRSET >= 8 // AVX2. Use VPERMD __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector __m256i table23 = _mm256_inserti128_si256(_mm256_castsi128_si256(table2), table3, 1); // join tables into 256 bit vector #if defined (_MSC_VER) && _MSC_VER < 1700 && ! 
defined(__INTEL_COMPILER) // bug in MS VS 11 beta: operands in wrong order __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ), table01)); __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23)); #elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) // Gcc 4.7.0 also has operands in wrong order __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ), table01)); __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23)); #else __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index))); __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table23, _mm256_castsi128_si256(index ^ 8))); #endif // bug return _mm_blendv_epi8(r0, r1, index > 8); #elif defined (__XOP__) // AMD XOP instruction set. Use VPPERM Vec4i r0 = _mm_perm_epi8(table0, table1, ((index ) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu); Vec4i r1 = _mm_perm_epi8(table2, table3, ((index ^ 8) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu); return r0 | r1; #elif INSTRSET >= 5 // SSSE3 Vec16c aa = Vec16c(Vec4i(0x73727170)); Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c((index ) * 0x04040404) + aa); Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c((index ^ 4) * 0x04040404) + aa); Vec4i r2 = _mm_shuffle_epi8(table2, Vec16c((index ^ 8) * 0x04040404) + aa); Vec4i r3 = _mm_shuffle_epi8(table3, Vec16c((index ^ 12) * 0x04040404) + aa); return (r0 | r1) | (r2 | r3); #else // SSE2 int32_t ii[4], tt[16], rr[4]; table0.store(tt); table1.store(tt+4); table2.store(tt+8); table3.store(tt+12); index.store(ii); for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x0F]; return Vec4i().load(rr); #endif } template static inline Vec4i lookup(Vec4i const & index, void const * table) { if (n <= 0) return 0; if (n <= 4) return lookup4(index, Vec4i().load(table)); if (n <= 8) return lookup8(index, Vec4i().load(table), Vec4i().load((int32_t*)table + 4)); // n > 8. Limit index Vec4ui index1; if ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec4ui(index) & (n-1); } else { // n is not a power of 2, limit to n-1 index1 = min(Vec4ui(index), n-1); } #if INSTRSET >= 8 // AVX2. Use VPERMD return _mm_i32gather_epi32((const int *)table, index1, 4); #else uint32_t ii[4]; index1.store(ii); return Vec4i(((int32_t*)table)[ii[0]], ((int32_t*)table)[ii[1]], ((int32_t*)table)[ii[2]], ((int32_t*)table)[ii[3]]); #endif } static inline Vec2q lookup2(Vec2q const & index, Vec2q const & table) { #if INSTRSET >= 5 // SSSE3 return _mm_shuffle_epi8(table, index * 0x0808080808080808ll + 0x0706050403020100ll); #else int64_t ii[2], tt[2]; table.store(tt); index.store(ii); return Vec2q(tt[int(ii[0])], tt[int(ii[1])]); #endif } template static inline Vec2q lookup(Vec2q const & index, void const * table) { if (n <= 0) return 0; // n > 0. Limit index Vec2uq index1; if ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec2uq(index) & (n-1); } else { // n is not a power of 2, limit to n-1. 
// There is no 64-bit min instruction, but we can use the 32-bit unsigned min, // since n is a 32-bit integer index1 = Vec2uq(min(Vec2uq(index), constant4i())); } uint32_t ii[4]; index1.store(ii); // use only lower 32 bits of each index int64_t const * tt = (int64_t const *)table; return Vec2q(tt[ii[0]], tt[ii[2]]); } /***************************************************************************** * * Other permutations with variable indexes * *****************************************************************************/ // Function shift_bytes_up: shift whole vector left by b bytes. // You may use a permute function instead if b is a compile-time constant static inline Vec16c shift_bytes_up(Vec16c const & a, int b) { if ((uint32_t)b > 15) return _mm_setzero_si128(); #if INSTRSET >= 4 // SSSE3 static const char mask[32] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask+16-b))); #else Vec2uq a1 = Vec2uq(a); if (b < 8) { a1 = (a1 << (b*8)) | (permute2uq<-1,0>(a1) >> (64 - (b*8))); } else { a1 = permute2uq<-1,0>(a1) << ((b-8)*8); } return Vec16c(a1); #endif } // Function shift_bytes_down: shift whole vector right by b bytes // You may use a permute function instead if b is a compile-time constant static inline Vec16c shift_bytes_down(Vec16c const & a, int b) { if ((uint32_t)b > 15) return _mm_setzero_si128(); #if INSTRSET >= 4 // SSSE3 static const char mask[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}; return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask+b))); #else Vec2uq a1 = Vec2uq(a); if (b < 8) { a1 = (a1 >> (b*8)) | (permute2uq<1,-1>(a1) << (64 - (b*8))); } else { a1 = permute2uq<1,-1>(a1) >> ((b-8)*8); } return Vec16c(a1); #endif } /***************************************************************************** * * Gather functions with fixed indexes * *****************************************************************************/ // Load elements from array a with indices i0, i1, i2, i3 template static inline Vec4i gather4i(void const * a) { Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index; // Error message if index is negative const int i01min = i0 < i1 ? i0 : i1; const int i23min = i2 < i3 ? i2 : i3; const int imin = i01min < i23min ? i01min : i23min; const int i01max = i0 > i1 ? i0 : i1; const int i23max = i2 > i3 ? i2 : i3; const int imax = i01max > i23max ? i01max : i23max; if (imax - imin <= 3) { // load one contiguous block and permute if (imax > 3) { // make sure we don't read past the end of the array Vec4i b = Vec4i().load((int32_t const *)a + imax-3); return permute4i(b); } else { Vec4i b = Vec4i().load((int32_t const *)a + imin); return permute4i(b); } } if ((i0imax-4) && (i1imax-4) && (i2imax-4) && (i3imax-4)) { // load two contiguous blocks and blend Vec4i b = Vec4i().load((int32_t const *)a + imin); Vec4i c = Vec4i().load((int32_t const *)a + imax-3); const int j0 = i0(b, c); } // use AVX2 gather if available #if INSTRSET >= 8 return _mm_i32gather_epi32((const int *)a, Vec4i(i0,i1,i2,i3), 4); #else return lookup(Vec4i(i0,i1,i2,i3), a); #endif } // Load elements from array a with indices i0, i1 template static inline Vec2q gather2q(void const * a) { Static_error_check<(i0|i1)>=0> Negative_array_index; // Error message if index is negative const int imin = i0 < i1 ? i0 : i1; const int imax = i0 > i1 ? 
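/* Example (hypothetical array): with int32_t arr[8] = {0,10,20,30,40,50,60,70},
   gather4i<6,2,2,7>(arr) returns (60,20,20,70). Because all four indexes fall
   inside two 4-element windows of the array, the code above should turn this
   into two contiguous 128-bit loads and a blend rather than four scalar loads. */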
i0 : i1; if (imax - imin <= 1) { // load one contiguous block and permute if (imax > 1) { // make sure we don't read past the end of the array Vec2q b = Vec2q().load((int64_t const *)a + imax-1); return permute2q(b); } else { Vec2q b = Vec2q().load((int64_t const *)a + imin); return permute2q(b); } } return Vec2q(((int64_t*)a)[i0], ((int64_t*)a)[i1]); } /***************************************************************************** * * Functions for conversion between integer sizes * *****************************************************************************/ // Extend 8-bit integers to 16-bit integers, signed and unsigned // Function extend_low : extends the low 8 elements to 16 bits with sign extension static inline Vec8s extend_low (Vec16c const & a) { __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(),a); // 0 > a return _mm_unpacklo_epi8(a,sign); // interleave with sign extensions } // Function extend_high : extends the high 8 elements to 16 bits with sign extension static inline Vec8s extend_high (Vec16c const & a) { __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(),a); // 0 > a return _mm_unpackhi_epi8(a,sign); // interleave with sign extensions } // Function extend_low : extends the low 8 elements to 16 bits with zero extension static inline Vec8us extend_low (Vec16uc const & a) { return _mm_unpacklo_epi8(a,_mm_setzero_si128()); // interleave with zero extensions } // Function extend_high : extends the high 8 elements to 16 bits with zero extension static inline Vec8us extend_high (Vec16uc const & a) { return _mm_unpackhi_epi8(a,_mm_setzero_si128()); // interleave with zero extensions } // Extend 16-bit integers to 32-bit integers, signed and unsigned // Function extend_low : extends the low 4 elements to 32 bits with sign extension static inline Vec4i extend_low (Vec8s const & a) { __m128i sign = _mm_srai_epi16(a,15); // sign bit return _mm_unpacklo_epi16(a,sign); // interleave with sign extensions } // Function extend_high : extends the high 4 elements to 32 bits with sign extension static inline Vec4i extend_high (Vec8s const & a) { __m128i sign = _mm_srai_epi16(a,15); // sign bit return _mm_unpackhi_epi16(a,sign); // interleave with sign extensions } // Function extend_low : extends the low 4 elements to 32 bits with zero extension static inline Vec4ui extend_low (Vec8us const & a) { return _mm_unpacklo_epi16(a,_mm_setzero_si128()); // interleave with zero extensions } // Function extend_high : extends the high 4 elements to 32 bits with zero extension static inline Vec4ui extend_high (Vec8us const & a) { return _mm_unpackhi_epi16(a,_mm_setzero_si128()); // interleave with zero extensions } // Extend 32-bit integers to 64-bit integers, signed and unsigned // Function extend_low : extends the low 2 elements to 64 bits with sign extension static inline Vec2q extend_low (Vec4i const & a) { __m128i sign = _mm_srai_epi32(a,31); // sign bit return _mm_unpacklo_epi32(a,sign); // interleave with sign extensions } // Function extend_high : extends the high 2 elements to 64 bits with sign extension static inline Vec2q extend_high (Vec4i const & a) { __m128i sign = _mm_srai_epi32(a,31); // sign bit return _mm_unpackhi_epi32(a,sign); // interleave with sign extensions } // Function extend_low : extends the low 2 elements to 64 bits with zero extension static inline Vec2uq extend_low (Vec4ui const & a) { return _mm_unpacklo_epi32(a,_mm_setzero_si128()); // interleave with zero extensions } // Function extend_high : extends the high 2 elements to 64 bits with zero extension static inline 
Vec2uq extend_high (Vec4ui const & a) { return _mm_unpackhi_epi32(a,_mm_setzero_si128()); // interleave with zero extensions } // Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation // Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers // Overflow wraps around static inline Vec16c compress (Vec8s const & low, Vec8s const & high) { __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for low bytes __m128i lowm = _mm_and_si128(low,mask); // bytes of low __m128i highm = _mm_and_si128(high,mask); // bytes of high return _mm_packus_epi16(lowm,highm); // unsigned pack } // Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers // Signed, with saturation static inline Vec16c compress_saturated (Vec8s const & low, Vec8s const & high) { return _mm_packs_epi16(low,high); } // Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers // Unsigned, overflow wraps around static inline Vec16uc compress (Vec8us const & low, Vec8us const & high) { return Vec16uc (compress((Vec8s)low, (Vec8s)high)); } // Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers // Unsigned, with saturation static inline Vec16uc compress_saturated (Vec8us const & low, Vec8us const & high) { #if INSTRSET >= 5 // SSE4.1 supported __m128i maxval = _mm_set1_epi32(0x00FF00FF); // maximum value __m128i minval = _mm_setzero_si128(); // minimum value = 0 __m128i low1 = _mm_min_epu16(low,maxval); // upper limit __m128i high1 = _mm_min_epu16(high,maxval); // upper limit __m128i low2 = _mm_max_epu16(low1,minval); // lower limit __m128i high2 = _mm_max_epu16(high1,minval); // lower limit return _mm_packus_epi16(low2,high2); // this instruction saturates from signed 32 bit to unsigned 16 bit #else __m128i zero = _mm_setzero_si128(); // 0 __m128i signlow = _mm_cmpgt_epi16(zero,low); // sign bit of low __m128i signhi = _mm_cmpgt_epi16(zero,high); // sign bit of high __m128i slow2 = _mm_srli_epi16(signlow,8); // FF if low negative __m128i shigh2 = _mm_srli_epi16(signhi,8); // FF if high negative __m128i maskns = _mm_set1_epi32(0x7FFF7FFF); // mask for removing sign bit __m128i lowns = _mm_and_si128(low,maskns); // low, with sign bit removed __m128i highns = _mm_and_si128(high,maskns); // high, with sign bit removed __m128i lowo = _mm_or_si128(lowns,slow2); // low, sign bit replaced by 00FF __m128i higho = _mm_or_si128(highns,shigh2); // high, sign bit replaced by 00FF return _mm_packus_epi16(lowo,higho); // this instruction saturates from signed 16 bit to unsigned 8 bit #endif } // Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Overflow wraps around static inline Vec8s compress (Vec4i const & low, Vec4i const & high) { #if INSTRSET >= 5 // SSE4.1 supported __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for low words __m128i lowm = _mm_and_si128(low,mask); // bytes of low __m128i highm = _mm_and_si128(high,mask); // bytes of high return _mm_packus_epi32(lowm,highm); // unsigned pack #else __m128i low1 = _mm_shufflelo_epi16(low,0xD8); // low words in place __m128i high1 = _mm_shufflelo_epi16(high,0xD8); // low words in place __m128i low2 = _mm_shufflehi_epi16(low1,0xD8); // low words in place __m128i high2 = _mm_shufflehi_epi16(high1,0xD8); // low words in place __m128i low3 = _mm_shuffle_epi32(low2,0xD8); // low dwords 
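/* The extend/compress pairs are inverses as long as the values fit in the
   narrower type (hypothetical values):
   Vec16c a(1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16);
   Vec8s  lo = extend_low(a), hi = extend_high(a);   // sign-extended halves
   Vec16c b  = compress(lo, hi);                     // b is equal to a again
   Vec16c c  = compress_saturated(Vec8s(300), Vec8s(-300));
                                  // saturates: eight elements of 127, then -128
*/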
of low to pos. 0 and 32 __m128i high3 = _mm_shuffle_epi32(high2,0xD8); // low dwords of high to pos. 0 and 32 return _mm_unpacklo_epi64(low3,high3); // interleave #endif } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Signed with saturation static inline Vec8s compress_saturated (Vec4i const & low, Vec4i const & high) { return _mm_packs_epi32(low,high); // pack with signed saturation } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Overflow wraps around static inline Vec8us compress (Vec4ui const & low, Vec4ui const & high) { return Vec8us (compress((Vec4i)low, (Vec4i)high)); } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Unsigned, with saturation static inline Vec8us compress_saturated (Vec4ui const & low, Vec4ui const & high) { #if INSTRSET >= 5 // SSE4.1 supported __m128i maxval = _mm_set1_epi32(0x0000FFFF); // maximum value __m128i minval = _mm_setzero_si128(); // minimum value = 0 __m128i low1 = _mm_min_epu32(low,maxval); // upper limit __m128i high1 = _mm_min_epu32(high,maxval); // upper limit __m128i low2 = _mm_max_epu32(low1,minval); // lower limit __m128i high2 = _mm_max_epu32(high1,minval); // lower limit return _mm_packus_epi32(low2,high2); // this instruction saturates from signed 32 bit to unsigned 16 bit #else __m128i zero = _mm_setzero_si128(); // 0 __m128i lowzero = _mm_cmpeq_epi16(low,zero); // for each word is zero __m128i highzero = _mm_cmpeq_epi16(high,zero); // for each word is zero __m128i mone = _mm_set1_epi32(-1); // FFFFFFFF __m128i lownz = _mm_xor_si128(lowzero,mone); // for each word is nonzero __m128i highnz = _mm_xor_si128(highzero,mone); // for each word is nonzero __m128i lownz2 = _mm_srli_epi32(lownz,16); // shift down to low dword __m128i highnz2 = _mm_srli_epi32(highnz,16); // shift down to low dword __m128i lowsatur = _mm_or_si128(low,lownz2); // low, saturated __m128i hisatur = _mm_or_si128(high,highnz2); // high, saturated return Vec8us (compress(Vec4i(lowsatur), Vec4i(hisatur))); #endif } // Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation // Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers // Overflow wraps around static inline Vec4i compress (Vec2q const & low, Vec2q const & high) { __m128i low2 = _mm_shuffle_epi32(low,0xD8); // low dwords of low to pos. 0 and 32 __m128i high2 = _mm_shuffle_epi32(high,0xD8); // low dwords of high to pos. 
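/* Saturation versus wrap-around for the 32-to-16-bit pack (hypothetical values):
   Vec4i lo(100000, -100000, 7, -7), hi(0, 1, 2, 3);
   Vec8s s = compress_saturated(lo, hi);   // s = (32767,-32768,7,-7,0,1,2,3)
   Vec8s w = compress(lo, hi);             // wraps: 100000 becomes -31072
*/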
0 and 32 return _mm_unpacklo_epi64(low2,high2); // interleave } // Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers // Signed, with saturation // This function is very inefficient unless the SSE4.2 instruction set is supported static inline Vec4i compress_saturated (Vec2q const & low, Vec2q const & high) { Vec2q maxval = _mm_set_epi32(0,0x7FFFFFFF,0,0x7FFFFFFF); Vec2q minval = _mm_set_epi32(-1,0x80000000,-1,0x80000000); Vec2q low1 = min(low,maxval); Vec2q high1 = min(high,maxval); Vec2q low2 = max(low1,minval); Vec2q high2 = max(high1,minval); return compress(low2,high2); } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Overflow wraps around static inline Vec4ui compress (Vec2uq const & low, Vec2uq const & high) { return Vec4ui (compress((Vec2q)low, (Vec2q)high)); } // Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers // Unsigned, with saturation static inline Vec4ui compress_saturated (Vec2uq const & low, Vec2uq const & high) { __m128i zero = _mm_setzero_si128(); // 0 __m128i lowzero = _mm_cmpeq_epi32(low,zero); // for each dword is zero __m128i highzero = _mm_cmpeq_epi32(high,zero); // for each dword is zero __m128i mone = _mm_set1_epi32(-1); // FFFFFFFF __m128i lownz = _mm_xor_si128(lowzero,mone); // for each dword is nonzero __m128i highnz = _mm_xor_si128(highzero,mone); // for each dword is nonzero __m128i lownz2 = _mm_srli_epi64(lownz,32); // shift down to low dword __m128i highnz2 = _mm_srli_epi64(highnz,32); // shift down to low dword __m128i lowsatur = _mm_or_si128(low,lownz2); // low, saturated __m128i hisatur = _mm_or_si128(high,highnz2); // high, saturated return Vec4ui (compress(Vec2q(lowsatur), Vec2q(hisatur))); } /***************************************************************************** * * Helper functions for division and bit scan * *****************************************************************************/ // Define popcount function. Gives sum of bits #if INSTRSET >= 6 // SSE4.2 // popcnt instruction is not officially part of the SSE4.2 instruction set, // but available in all known processors with SSE4.2 #if defined (__GNUC__) || defined(__clang__) static inline uint32_t vml_popcnt (uint32_t a) __attribute__ ((pure)); static inline uint32_t vml_popcnt (uint32_t a) { uint32_t r; __asm("popcnt %1, %0" : "=r"(r) : "r"(a) : ); return r; } #else static inline uint32_t vml_popcnt (uint32_t a) { return _mm_popcnt_u32(a); // MS intrinsic } #endif // platform #else // no SSE4.2 static inline uint32_t vml_popcnt (uint32_t a) { // popcnt instruction not available uint32_t b = a - ((a >> 1) & 0x55555555); uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333); uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F; uint32_t e = d * 0x01010101; return e >> 24; } #endif // Define bit-scan-forward function. Gives index to lowest set bit #if defined (__GNUC__) || defined(__clang__) static inline uint32_t bit_scan_reverse (uint32_t a) __attribute__ ((pure)); static inline uint32_t bit_scan_forward (uint32_t a) { uint32_t r; __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : ); return r; } #else static inline uint32_t bit_scan_forward (uint32_t a) { unsigned long r; _BitScanForward(&r, a); // defined in intrin.h for MS and Intel compilers return r; } #endif // Define bit-scan-reverse function. 
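/* Quick sanity examples for the scalar helpers (values chosen arbitrarily):
   vml_popcnt(0xF0F0)     == 8    // eight bits set
   bit_scan_forward(0x58) == 3    // lowest set bit of binary 1011000
*/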
// Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
#if defined (__GNUC__) || defined(__clang__)
static inline uint32_t bit_scan_reverse (uint32_t a) __attribute__ ((pure));
static inline uint32_t bit_scan_reverse (uint32_t a) {
    uint32_t r;
    __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : );
    return r;
}
#else
static inline uint32_t bit_scan_reverse (uint32_t a) {
    unsigned long r;
    _BitScanReverse(&r, a);                                // defined in intrin.h for MS and Intel compilers
    return r;
}
#endif

// Same function, for compile-time constants.
// We need template metaprogramming for calculating this function at compile time.
// This may take a long time to compile because of the template recursion.
// Todo: replace this with a constexpr function when C++14 becomes available
template <uint32_t n> struct BitScanR {
    enum {val = (
        n >= 0x10 ? 4 + (BitScanR<(n>>4)>::val) :
        n  <    2 ? 0 :
        n  <    4 ? 1 :
        n  <    8 ? 2 : 3 ) };
};
template <> struct BitScanR<0> {enum {val = 0};};          // Avoid infinite template recursion

#define bit_scan_reverse_const(n) (BitScanR<n>::val)       // n must be a valid compile-time constant

/*****************************************************************************
*
*          Integer division operators
*
******************************************************************************
*
* The instruction set does not support integer vector division. Instead, we
* are using a method for fast integer division based on multiplication and
* shift operations. This method is faster than simple integer division if the
* same divisor is used multiple times.
*
* All elements in a vector are divided by the same divisor. It is not possible
* to divide different elements of the same vector by different divisors.
*
* The parameters used for fast division are stored in an object of a
* Divisor class. This object can be created implicitly, for example in:
*        Vec4i a, b; int c;
*        a = b / c;
* or explicitly as:
*        a = b / Divisor_i(c);
*
* It takes more time to compute the parameters used for fast division than to
* do the division. Therefore, it is advantageous to use the same divisor object
* multiple times. For example, to divide 80 unsigned short integers by 10:
*
*        uint16_t dividends[80], quotients[80];         // numbers to work with
*        Divisor_us div10(10);                          // make divisor object for dividing by 10
*        Vec8us temp;                                   // temporary vector
*        for (int i = 0; i < 80; i += 8) {              // loop for 8 elements per iteration
*            temp.load(dividends+i);                    // load 8 elements
*            temp /= div10;                             // divide each element by 10
*            temp.store(quotients+i);                   // store 8 elements
*        }
*
* The parameters for fast division can also be computed at compile time. This is
* an advantage if the divisor is known at compile time. Use the const_int or const_uint
* macro to do this. For example, for signed integers:
*        Vec8s a, b;
*        a = b / const_int(10);
* Or, for unsigned integers:
*        Vec8us a, b;
*        a = b / const_uint(10);
*
* The division of a vector of 16-bit integers is faster than division of a vector
* of other integer sizes.
*
*
* Mathematical formula, used for signed division with fixed or variable divisor:
* (From T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
* Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 )
* x = dividend
* d = abs(divisor)
* w = integer word size, bits
* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1
* L = max(L,1)
* m = 1 + 2^(w+L-1)/d - 2^w                 [division should overflow to 0 if d = 1]
* sh1 = L-1
* q = x + (m*x >> w)                        [high part of signed multiplication with 2w bits]
* q = (q >> sh1) - (x<0 ? -1 : 0)
* if (divisor < 0) q = -q
* result trunc(x/d) = q
*
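* Numeric illustration of the signed formula (example values only, not part of the
* algorithm itself): for w = 32 and divisor d = 7, L = ceil(log2(7)) = 3, sh1 = 2 and
* m = 1 + 2^34/7 - 2^32 = -1840700269 (0x92492493). For x = 100: m*x >> 32 = -43,
* q = 100 - 43 = 57, and 57 >> 2 = 14 = trunc(100/7).
*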
* Mathematical formula, used for unsigned division with variable divisor:
* (Also from T. Granlund and P. L. Montgomery)
* x = dividend
* d = divisor
* w = integer word size, bits
* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1
* m = 1 + 2^w * (2^L-d) / d                 [2^L should overflow to 0 if L = w]
* sh1 = min(L,1)
* sh2 = max(L-1,0)
* t = m*x >> w                              [high part of unsigned multiplication with 2w bits]
* result floor(x/d) = (((x-t) >> sh1) + t) >> sh2
*
* Mathematical formula, used for unsigned division with fixed divisor:
* (From Terje Mathisen, unpublished)
* x = dividend
* d = divisor
* w = integer word size, bits
* b = floor(log2(d)) = bit_scan_reverse(d)
* f = 2^(w+b) / d                           [exact division]
* If f is an integer then d is a power of 2: go to case A
* If the fractional part of f is < 0.5 then go to case B
* If the fractional part of f is > 0.5 then go to case C
* Case A:  [shift only]
* result = x >> b
* Case B:  [round down f and compensate by adding one to x]
* result = ((x+1)*floor(f)) >> (w+b)        [high part of unsigned multiplication with 2w bits]
* Case C:  [round up f, no compensation for rounding error]
* result = (x*ceil(f)) >> (w+b)             [high part of unsigned multiplication with 2w bits]
*
*
*****************************************************************************/

// encapsulate parameters for fast division on vector of 4 32-bit signed integers
class Divisor_i {
protected:
    __m128i multiplier;                                    // multiplier used in fast division
    __m128i shift1;                                        // shift count used in fast division
    __m128i sign;                                          // sign of divisor
public:
    Divisor_i() {};                                        // Default constructor
    Divisor_i(int32_t d) {                                 // Constructor with divisor
        set(d);
    }
    Divisor_i(int m, int s1, int sgn) {                    // Constructor with precalculated multiplier, shift and sign
        multiplier = _mm_set1_epi32(m);
        shift1     = _mm_cvtsi32_si128(s1);
        sign       = _mm_set1_epi32(sgn);
    }
    void set(int32_t d) {                                  // Set or change divisor, calculate parameters
        const int32_t d1 = abs(d);
        int32_t sh, m;
        if (d1 > 1) {
            sh = bit_scan_reverse(d1-1);                   // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1
            m  = int32_t((int64_t(1) << (32+sh)) / d1 - ((int64_t(1) << 32) - 1)); // calculate multiplier
        }
        else {
            m  = 1;                                        // for d1 = 1
            sh = 0;
            if (d == 0) m /= d;                            // provoke error here if d = 0
            if (uint32_t(d) == 0x80000000u) {              // fix overflow for this special case
                m  = 0x80000001;
                sh = 30;
            }
        }
        multiplier = _mm_set1_epi32(m);                    // broadcast multiplier
        shift1     = _mm_setr_epi32(sh, 0, 0, 0);          // shift count
        sign       = _mm_set1_epi32(d < 0 ?
-1 : 0); // sign of divisor } __m128i getm() const { // get multiplier return multiplier; } __m128i gets1() const { // get shift count return shift1; } __m128i getsign() const { // get sign of divisor return sign; } }; // encapsulate parameters for fast division on vector of 4 32-bit unsigned integers class Divisor_ui { protected: __m128i multiplier; // multiplier used in fast division __m128i shift1; // shift count 1 used in fast division __m128i shift2; // shift count 2 used in fast division public: Divisor_ui() {}; // Default constructor Divisor_ui(uint32_t d) { // Constructor with divisor set(d); } Divisor_ui(uint32_t m, int s1, int s2) { // Constructor with precalculated multiplier and shifts multiplier = _mm_set1_epi32(m); shift1 = _mm_setr_epi32(s1, 0, 0, 0); shift2 = _mm_setr_epi32(s2, 0, 0, 0); } void set(uint32_t d) { // Set or change divisor, calculate parameters uint32_t L, L2, sh1, sh2, m; switch (d) { case 0: m = sh1 = sh2 = 1 / d; // provoke error for d = 0 break; case 1: m = 1; sh1 = sh2 = 0; // parameters for d = 1 break; case 2: m = 1; sh1 = 1; sh2 = 0; // parameters for d = 2 break; default: // general case for d > 2 L = bit_scan_reverse(d-1)+1; // ceil(log2(d)) L2 = L < 32 ? 1 << L : 0; // 2^L, overflow to 0 if L = 32 m = 1 + uint32_t((uint64_t(L2 - d) << 32) / d); // multiplier sh1 = 1; sh2 = L - 1; // shift counts } multiplier = _mm_set1_epi32(m); shift1 = _mm_setr_epi32(sh1, 0, 0, 0); shift2 = _mm_setr_epi32(sh2, 0, 0, 0); } __m128i getm() const { // get multiplier return multiplier; } __m128i gets1() const { // get shift count 1 return shift1; } __m128i gets2() const { // get shift count 2 return shift2; } }; // encapsulate parameters for fast division on vector of 8 16-bit signed integers class Divisor_s { protected: __m128i multiplier; // multiplier used in fast division __m128i shift1; // shift count used in fast division __m128i sign; // sign of divisor public: Divisor_s() {}; // Default constructor Divisor_s(int16_t d) { // Constructor with divisor set(d); } Divisor_s(int16_t m, int s1, int sgn) { // Constructor with precalculated multiplier, shift and sign multiplier = _mm_set1_epi16(m); shift1 = _mm_setr_epi32(s1, 0, 0, 0); sign = _mm_set1_epi32(sgn); } void set(int16_t d) { // Set or change divisor, calculate parameters const int32_t d1 = abs(d); int32_t sh, m; if (d1 > 1) { sh = bit_scan_reverse(d1-1); // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1 m = ((int32_t(1) << (16+sh)) / d1 - ((int32_t(1) << 16) - 1)); // calculate multiplier } else { m = 1; // for d1 = 1 sh = 0; if (d == 0) m /= d; // provoke error here if d = 0 if (uint16_t(d) == 0x8000u) { // fix overflow for this special case m = 0x8001; sh = 14; } } multiplier = _mm_set1_epi16(int16_t(m)); // broadcast multiplier shift1 = _mm_setr_epi32(sh, 0, 0, 0); // shift count sign = _mm_set1_epi32(d < 0 ? 
-1 : 0); // sign of divisor } __m128i getm() const { // get multiplier return multiplier; } __m128i gets1() const { // get shift count return shift1; } __m128i getsign() const { // get sign of divisor return sign; } }; // encapsulate parameters for fast division on vector of 8 16-bit unsigned integers class Divisor_us { protected: __m128i multiplier; // multiplier used in fast division __m128i shift1; // shift count 1 used in fast division __m128i shift2; // shift count 2 used in fast division public: Divisor_us() {}; // Default constructor Divisor_us(uint16_t d) { // Constructor with divisor set(d); } Divisor_us(uint16_t m, int s1, int s2) { // Constructor with precalculated multiplier and shifts multiplier = _mm_set1_epi16(m); shift1 = _mm_setr_epi32(s1, 0, 0, 0); shift2 = _mm_setr_epi32(s2, 0, 0, 0); } void set(uint16_t d) { // Set or change divisor, calculate parameters uint16_t L, L2, sh1, sh2, m; switch (d) { case 0: m = sh1 = sh2 = 1 / d; // provoke error for d = 0 break; case 1: m = 1; sh1 = sh2 = 0; // parameters for d = 1 break; case 2: m = 1; sh1 = 1; sh2 = 0; // parameters for d = 2 break; default: // general case for d > 2 L = (uint16_t)bit_scan_reverse(d-1)+1; // ceil(log2(d)) L2 = uint16_t(1 << L); // 2^L, overflow to 0 if L = 16 m = 1 + uint16_t((uint32_t(L2 - d) << 16) / d); // multiplier sh1 = 1; sh2 = L - 1; // shift counts } multiplier = _mm_set1_epi16(m); shift1 = _mm_setr_epi32(sh1, 0, 0, 0); shift2 = _mm_setr_epi32(sh2, 0, 0, 0); } __m128i getm() const { // get multiplier return multiplier; } __m128i gets1() const { // get shift count 1 return shift1; } __m128i gets2() const { // get shift count 2 return shift2; } }; // vector operator / : divide each element by divisor // vector of 4 32-bit signed integers static inline Vec4i operator / (Vec4i const & a, Divisor_i const & d) { #if defined (__XOP__) && defined (GCC_VERSION) && GCC_VERSION <= 40702/*??*/ && !defined(__INTEL_COMPILER) && !defined(__clang__) #define XOP_MUL_BUG // GCC has bug in XOP multiply // Bug found in GCC version 4.7.0 and 4.7.1 #endif // todo: test this when GCC bug is fixed #if defined (__XOP__) && !defined (XOP_MUL_BUG) __m128i t1 = _mm_mul_epi32(a,d.getm()); // 32x32->64 bit signed multiplication of a[0] and a[2] __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 __m128i t3 = _mm_macchi_epi32(a,d.getm(),_mm_setzero_si128());// 32x32->64 bit signed multiplication of a[1] and a[3] __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 __m128i t7 = _mm_blendv_epi8(t2,t3,t5); // blend two results __m128i t8 = _mm_add_epi32(t7,a); // add __m128i t9 = _mm_sra_epi32(t8,d.gets1()); // shift right arithmetic __m128i t10 = _mm_srai_epi32(a,31); // sign of a __m128i t11 = _mm_sub_epi32(t10,d.getsign()); // sign of a - sign of d __m128i t12 = _mm_sub_epi32(t9,t11); // + 1 if a < 0, -1 if d < 0 return _mm_xor_si128(t12,d.getsign()); // change sign if divisor negative #elif INSTRSET >= 5 && !defined (XOP_MUL_BUG) // SSE4.1 supported __m128i t1 = _mm_mul_epi32(a,d.getm()); // 32x32->64 bit signed multiplication of a[0] and a[2] __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 __m128i t3 = _mm_srli_epi64(a,32); // get a[1] and a[3] into position for multiplication __m128i t4 = _mm_mul_epi32(t3,d.getm()); // 32x32->64 bit signed multiplication of a[1] and a[3] __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 __m128i t7 = _mm_blendv_epi8(t2,t4,t5); // blend two results __m128i t8 = _mm_add_epi32(t7,a); // add __m128i t9 = _mm_sra_epi32(t8,d.gets1()); 
// shift right arithmetic __m128i t10 = _mm_srai_epi32(a,31); // sign of a __m128i t11 = _mm_sub_epi32(t10,d.getsign()); // sign of a - sign of d __m128i t12 = _mm_sub_epi32(t9,t11); // + 1 if a < 0, -1 if d < 0 return _mm_xor_si128(t12,d.getsign()); // change sign if divisor negative #else // not SSE4.1 __m128i t1 = _mm_mul_epu32(a,d.getm()); // 32x32->64 bit unsigned multiplication of a[0] and a[2] __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 __m128i t3 = _mm_srli_epi64(a,32); // get a[1] and a[3] into position for multiplication __m128i t4 = _mm_mul_epu32(t3,d.getm()); // 32x32->64 bit unsigned multiplication of a[1] and a[3] __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 __m128i t6 = _mm_and_si128(t4,t5); // high dword of result 1 and 3 __m128i t7 = _mm_or_si128(t2,t6); // combine all four results of unsigned high mul into one vector // convert unsigned to signed high multiplication (from: H S Warren: Hacker's delight, 2003, p. 132) __m128i u1 = _mm_srai_epi32(a,31); // sign of a __m128i u2 = _mm_srai_epi32(d.getm(),31); // sign of m [ m is always negative, except for abs(d) = 1 ] __m128i u3 = _mm_and_si128 (d.getm(),u1); // m * sign of a __m128i u4 = _mm_and_si128 (a,u2); // a * sign of m __m128i u5 = _mm_add_epi32 (u3,u4); // sum of sign corrections __m128i u6 = _mm_sub_epi32 (t7,u5); // high multiplication result converted to signed __m128i t8 = _mm_add_epi32(u6,a); // add a __m128i t9 = _mm_sra_epi32(t8,d.gets1()); // shift right arithmetic __m128i t10 = _mm_sub_epi32(u1,d.getsign()); // sign of a - sign of d __m128i t11 = _mm_sub_epi32(t9,t10); // + 1 if a < 0, -1 if d < 0 return _mm_xor_si128(t11,d.getsign()); // change sign if divisor negative #endif } // vector of 4 32-bit unsigned integers static inline Vec4ui operator / (Vec4ui const & a, Divisor_ui const & d) { __m128i t1 = _mm_mul_epu32(a,d.getm()); // 32x32->64 bit unsigned multiplication of a[0] and a[2] __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 __m128i t3 = _mm_srli_epi64(a,32); // get a[1] and a[3] into position for multiplication __m128i t4 = _mm_mul_epu32(t3,d.getm()); // 32x32->64 bit unsigned multiplication of a[1] and a[3] __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 #if INSTRSET >= 5 // SSE4.1 supported __m128i t7 = _mm_blendv_epi8(t2,t4,t5); // blend two results #else __m128i t6 = _mm_and_si128(t4,t5); // high dword of result 1 and 3 __m128i t7 = _mm_or_si128(t2,t6); // combine all four results into one vector #endif __m128i t8 = _mm_sub_epi32(a,t7); // subtract __m128i t9 = _mm_srl_epi32(t8,d.gets1()); // shift right logical __m128i t10 = _mm_add_epi32(t7,t9); // add return _mm_srl_epi32(t10,d.gets2()); // shift right logical } // vector of 8 16-bit signed integers static inline Vec8s operator / (Vec8s const & a, Divisor_s const & d) { __m128i t1 = _mm_mulhi_epi16(a, d.getm()); // multiply high signed words __m128i t2 = _mm_add_epi16(t1,a); // + a __m128i t3 = _mm_sra_epi16(t2,d.gets1()); // shift right arithmetic __m128i t4 = _mm_srai_epi16(a,15); // sign of a __m128i t5 = _mm_sub_epi16(t4,d.getsign()); // sign of a - sign of d __m128i t6 = _mm_sub_epi16(t3,t5); // + 1 if a < 0, -1 if d < 0 return _mm_xor_si128(t6,d.getsign()); // change sign if divisor negative } // vector of 8 16-bit unsigned integers static inline Vec8us operator / (Vec8us const & a, Divisor_us const & d) { __m128i t1 = _mm_mulhi_epu16(a, d.getm()); // multiply high unsigned words __m128i t2 = _mm_sub_epi16(a,t1); // subtract __m128i t3 = 
_mm_srl_epi16(t2,d.gets1());                               // shift right logical
    __m128i t4 = _mm_add_epi16(t1,t3);                     // add
    return  _mm_srl_epi16(t4,d.gets2());                   // shift right logical
}

// vector of 16 8-bit signed integers
static inline Vec16c operator / (Vec16c const & a, Divisor_s const & d) {
    // expand into two Vec8s
    Vec8s low  = extend_low(a)  / d;
    Vec8s high = extend_high(a) / d;
    return compress(low,high);
}

// vector of 16 8-bit unsigned integers
static inline Vec16uc operator / (Vec16uc const & a, Divisor_us const & d) {
    // expand into two Vec8us
    Vec8us low  = extend_low(a)  / d;
    Vec8us high = extend_high(a) / d;
    return compress(low,high);
}

// vector operator /= : divide
static inline Vec8s & operator /= (Vec8s & a, Divisor_s const & d) {
    a = a / d;
    return a;
}

// vector operator /= : divide
static inline Vec8us & operator /= (Vec8us & a, Divisor_us const & d) {
    a = a / d;
    return a;
}

// vector operator /= : divide
static inline Vec4i & operator /= (Vec4i & a, Divisor_i const & d) {
    a = a / d;
    return a;
}

// vector operator /= : divide
static inline Vec4ui & operator /= (Vec4ui & a, Divisor_ui const & d) {
    a = a / d;
    return a;
}

// vector operator /= : divide
static inline Vec16c & operator /= (Vec16c & a, Divisor_s const & d) {
    a = a / d;
    return a;
}

// vector operator /= : divide
static inline Vec16uc & operator /= (Vec16uc & a, Divisor_us const & d) {
    a = a / d;
    return a;
}

/*****************************************************************************
*
*          Integer division 2: divisor is a compile-time constant
*
*****************************************************************************/

// Divide Vec4i by compile-time constant
template <int32_t d>
static inline Vec4i divide_by_i(Vec4i const & x) {
    Static_error_check<(d!=0)> Dividing_by_zero;           // Error message if dividing by zero
    if (d ==  1) return  x;
    if (d == -1) return -x;
    if (uint32_t(d) == 0x80000000u) return Vec4i(x == Vec4i(0x80000000)) & 1; // prevent overflow when changing sign
    const uint32_t d1 = d > 0 ? uint32_t(d) : uint32_t(-d);// compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
    if ((d1 & (d1-1)) == 0) {
        // d1 is a power of 2. use shift
        const int k = bit_scan_reverse_const(d1);
        __m128i sign;
        if (k > 1) sign = _mm_srai_epi32(x, k-1); else sign = x;  // k copies of sign bit
        __m128i bias    = _mm_srli_epi32(sign, 32-k);      // bias = x >= 0 ? 0 : 2^k-1
        __m128i xpbias  = _mm_add_epi32 (x, bias);         // x + bias
        __m128i q       = _mm_srai_epi32(xpbias, k);       // (x + bias) >> k
        if (d > 0)  return q;                              // d > 0: return q
        return _mm_sub_epi32(_mm_setzero_si128(), q);      // d < 0: return -q
    }
    // general case
    const int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1);    // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
    const int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32)); // multiplier
    const Divisor_i div(mult, sh, d < 0 ? -1 : 0);
    return x / div;
}
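// Illustration of the power-of-2 branch above (example values only): for d = 4 (k = 2) and
// x = -7, bias = 3, so (x + bias) >> k = (-7 + 3) >> 2 = -1 = trunc(-7/4). A plain arithmetic
// shift without the bias would give -2, i.e. rounding toward negative infinity instead of zero.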
// define Vec4i a / const_int(d)
template <int32_t d>
static inline Vec4i operator / (Vec4i const & a, Const_int_t<d>) {
    return divide_by_i<d>(a);
}

// define Vec4i a / const_uint(d)
template <uint32_t d>
static inline Vec4i operator / (Vec4i const & a, Const_uint_t<d>) {
    Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
    return divide_by_i<int32_t(d)>(a);                     // signed divide
}

// vector operator /= : divide
template <int32_t d>
static inline Vec4i & operator /= (Vec4i & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec4i & operator /= (Vec4i & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}

// Divide Vec4ui by compile-time constant
template <uint32_t d>
static inline Vec4ui divide_by_ui(Vec4ui const & x) {
    Static_error_check<(d!=0)> Dividing_by_zero;           // Error message if dividing by zero
    if (d == 1) return x;                                  // divide by 1
    const int b = bit_scan_reverse_const(d);               // floor(log2(d))
    if ((uint32_t(d) & (uint32_t(d)-1)) == 0) {
        // d is a power of 2. use shift
        return  _mm_srli_epi32(x, b);                      // x >> b
    }
    // general case (d > 2)
    uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d); // multiplier = 2^(32+b) / d
    const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d
    const bool round_down = (2*rem < d);                   // check if fraction is less than 0.5
    if (!round_down) {
        mult = mult + 1;                                   // round up mult
    }
    // do 32*32->64 bit unsigned multiplication and get high part of result
    const __m128i multv = _mm_set_epi32(0,mult,0,mult);    // zero-extend mult and broadcast
    __m128i t1 = _mm_mul_epu32(x,multv);                   // 32x32->64 bit unsigned multiplication of x[0] and x[2]
    if (round_down) {
        t1 = _mm_add_epi64(t1,multv);                      // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
    }
    __m128i t2 = _mm_srli_epi64(t1,32);                    // high dword of result 0 and 2
    __m128i t3 = _mm_srli_epi64(x,32);                     // get x[1] and x[3] into position for multiplication
    __m128i t4 = _mm_mul_epu32(t3,multv);                  // 32x32->64 bit unsigned multiplication of x[1] and x[3]
    if (round_down) {
        t4 = _mm_add_epi64(t4,multv);                      // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
    }
    __m128i t5 = _mm_set_epi32(-1,0,-1,0);                 // mask of dword 1 and 3
#if INSTRSET >= 5   // SSE4.1 supported
    __m128i t7 = _mm_blendv_epi8(t2,t4,t5);                // blend two results
#else
    __m128i t6 = _mm_and_si128(t4,t5);                     // high dword of result 1 and 3
    __m128i t7 = _mm_or_si128(t2,t6);                      // combine all four results into one vector
#endif
    Vec4ui  q  = _mm_srli_epi32(t7, b);                    // shift right by b
    return  q;                                             // no overflow possible
}
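// Illustration of the rounding choice above (example values only): for d = 10, b = 3 and
// 2^35/10 = 3435973836.8. The fraction 0.8 is above 0.5, so mult is rounded up to
// 3435973837 (0xCCCCCCCD) and the quotient is the high part of x*mult shifted right by b,
// with no compensation needed. A fraction below 0.5 would instead use (x+1)*mult.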
// define Vec4ui a / const_uint(d)
template <uint32_t d>
static inline Vec4ui operator / (Vec4ui const & a, Const_uint_t<d>) {
    return divide_by_ui<d>(a);
}

// define Vec4ui a / const_int(d)
template <int32_t d>
static inline Vec4ui operator / (Vec4ui const & a, Const_int_t<d>) {
    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative; // Error: dividing unsigned by negative is ambiguous
    return divide_by_ui<d>(a);                             // unsigned divide
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec4ui & operator /= (Vec4ui & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <int32_t d>
static inline Vec4ui & operator /= (Vec4ui & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}

// Divide Vec8s by compile-time constant
template <int d>
static inline Vec8s divide_by_i(Vec8s const & x) {
    const int16_t d0 = int16_t(d);                         // truncate d to 16 bits
    Static_error_check<(d0 != 0)> Dividing_by_zero;        // Error message if dividing by zero
    if (d0 ==  1) return  x;                               // divide by  1
    if (d0 == -1) return -x;                               // divide by -1
    if (uint16_t(d0) == 0x8000u) return (x == Vec8s(0x8000)) & 1; // prevent overflow when changing sign
    // if (d > 0x7FFF || d < -0x8000) return 0;            // not relevant when d truncated to 16 bits
    const uint16_t d1 = d0 > 0 ? d0 : -d0;                 // compile-time abs(d0)
    if ((d1 & (d1-1)) == 0) {
        // d is a power of 2. use shift
        const int k = bit_scan_reverse_const(uint32_t(d1));
        __m128i sign;
        if (k > 1) sign = _mm_srai_epi16(x, k-1); else sign = x;  // k copies of sign bit
        __m128i bias    = _mm_srli_epi16(sign, 16-k);      // bias = x >= 0 ? 0 : 2^k-1
        __m128i xpbias  = _mm_add_epi16 (x, bias);         // x + bias
        __m128i q       = _mm_srai_epi16(xpbias, k);       // (x + bias) >> k
        if (d0 > 0)  return q;                             // d0 > 0: return q
        return _mm_sub_epi16(_mm_setzero_si128(), q);      // d0 < 0: return -q
    }
    // general case
    const int L = bit_scan_reverse_const(uint16_t(d1-1)) + 1;     // ceil(log2(d)). (d < 2 handled above)
    const int16_t mult = int16_t(1 + (1u << (15+L)) / uint32_t(d1) - 0x10000); // multiplier
    const int shift1 = L - 1;
    const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1);
    return x / div;
}

// define Vec8s a / const_int(d)
template <int d>
static inline Vec8s operator / (Vec8s const & a, Const_int_t<d>) {
    return divide_by_i<d>(a);
}

// define Vec8s a / const_uint(d)
template <uint32_t d>
static inline Vec8s operator / (Vec8s const & a, Const_uint_t<d>) {
    Static_error_check< (d<0x8000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
    return divide_by_i<int(d)>(a);                         // signed divide
}

// vector operator /= : divide
template <int d>
static inline Vec8s & operator /= (Vec8s & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec8s & operator /= (Vec8s & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}
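// Note on the 0x8000 special case above (illustration only): the only 16-bit value whose
// quotient by -0x8000 is nonzero is -0x8000 itself (-0x8000 / -0x8000 = 1); every other x
// satisfies |x| < |d| and truncates to 0, which is what the compare-and-mask expression produces.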
// Divide Vec8us by compile-time constant
template <uint32_t d>
static inline Vec8us divide_by_ui(Vec8us const & x) {
    const uint16_t d0 = uint16_t(d);                       // truncate d to 16 bits
    Static_error_check<(d0 != 0)> Dividing_by_zero;        // Error message if dividing by zero
    if (d0 == 1) return x;                                 // divide by 1
    const int b = bit_scan_reverse_const(d0);              // floor(log2(d))
    if ((d0 & (d0-1)) == 0) {
        // d is a power of 2. use shift
        return  _mm_srli_epi16(x, b);                      // x >> b
    }
    // general case (d > 2)
    uint16_t mult = uint16_t((uint32_t(1) << (b+16)) / d0);          // multiplier = 2^(16+b) / d
    const uint32_t rem = (uint32_t(1) << (b+16)) - uint32_t(d0)*mult; // remainder 2^(16+b) % d
    const bool round_down = (2*rem < d0);                  // check if fraction is less than 0.5
    Vec8us x1 = x;
    if (round_down) {
        x1 = x1 + 1;                                       // round down mult and compensate by adding 1 to x
    }
    else {
        mult = mult + 1;                                   // round up mult. no compensation needed
    }
    const __m128i multv = _mm_set1_epi16(mult);            // broadcast mult
    __m128i xm = _mm_mulhi_epu16(x1, multv);               // high part of 16x16->32 bit unsigned multiplication
    Vec8us q   = _mm_srli_epi16(xm, b);                    // shift right by b
    if (round_down) {
        Vec8s overfl = (x1 == (Vec8us)_mm_setzero_si128());// check for overflow of x+1
        return select(overfl, Vec8us(mult >> b), q);       // deal with overflow (rarely needed)
    }
    else {
        return q;                                          // no overflow possible
    }
}

// define Vec8us a / const_uint(d)
template <uint32_t d>
static inline Vec8us operator / (Vec8us const & a, Const_uint_t<d>) {
    return divide_by_ui<d>(a);
}

// define Vec8us a / const_int(d)
template <int d>
static inline Vec8us operator / (Vec8us const & a, Const_int_t<d>) {
    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative; // Error: dividing unsigned by negative is ambiguous
    return divide_by_ui<d>(a);                             // unsigned divide
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec8us & operator /= (Vec8us & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <int d>
static inline Vec8us & operator /= (Vec8us & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}

// define Vec16c a / const_int(d)
template <int d>
static inline Vec16c operator / (Vec16c const & a, Const_int_t<d>) {
    // expand into two Vec8s
    Vec8s low  = extend_low(a)  / Const_int_t<d>();
    Vec8s high = extend_high(a) / Const_int_t<d>();
    return compress(low,high);
}

// define Vec16c a / const_uint(d)
template <uint32_t d>
static inline Vec16c operator / (Vec16c const & a, Const_uint_t<d>) {
    Static_error_check< (uint8_t(d)<0x80u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
    return a / Const_int_t<d>();                           // signed divide
}

// vector operator /= : divide
template <int d>
static inline Vec16c & operator /= (Vec16c & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec16c & operator /= (Vec16c & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}

// define Vec16uc a / const_uint(d)
template <uint32_t d>
static inline Vec16uc operator / (Vec16uc const & a, Const_uint_t<d>) {
    // expand into two Vec8us
    Vec8us low  = extend_low(a)  / Const_uint_t<d>();
    Vec8us high = extend_high(a) / Const_uint_t<d>();
    return compress(low,high);
}

// define Vec16uc a / const_int(d)
template <int d>
static inline Vec16uc operator / (Vec16uc const & a, Const_int_t<d>) {
    Static_error_check< (int8_t(d)>=0) > Error_dividing_unsigned_by_negative; // Error: dividing unsigned by negative is ambiguous
    return a / Const_uint_t<d>();                          // unsigned divide
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec16uc & operator /= (Vec16uc & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <int d>
static inline Vec16uc & operator /= (Vec16uc & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}
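// Usage sketch (example values only): dividing all 16 bytes of a Vec16uc by the constant 3
// expands the operands to two Vec8us vectors, divides each with the 16-bit routine above,
// and compresses the quotients back to bytes:
//     Vec16uc a, b;
//     a = b / const_uint(3);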
/*****************************************************************************
*
*          Horizontal scan functions
*
*****************************************************************************/

// Get index to the first element that is true. Return -1 if all are false
static inline int horizontal_find_first(Vec16cb const & x) {
    uint32_t a = _mm_movemask_epi8(x);
    if (a == 0) return -1;
    int32_t b = bit_scan_forward(a);
    return b;
}

static inline int horizontal_find_first(Vec8sb const & x) {
    return horizontal_find_first(Vec16cb(x)) >> 1;         // must use signed shift
}

static inline int horizontal_find_first(Vec4ib const & x) {
    return horizontal_find_first(Vec16cb(x)) >> 2;         // must use signed shift
}

static inline int horizontal_find_first(Vec2qb const & x) {
    return horizontal_find_first(Vec16cb(x)) >> 3;         // must use signed shift
}

// Count the number of elements that are true
static inline uint32_t horizontal_count(Vec16cb const & x) {
    uint32_t a = _mm_movemask_epi8(x);
    return vml_popcnt(a);
}

static inline uint32_t horizontal_count(Vec8sb const & x) {
    return horizontal_count(Vec16cb(x)) >> 1;
}

static inline uint32_t horizontal_count(Vec4ib const & x) {
    return horizontal_count(Vec16cb(x)) >> 2;
}

static inline uint32_t horizontal_count(Vec2qb const & x) {
    return horizontal_count(Vec16cb(x)) >> 3;
}

/*****************************************************************************
*
*          Boolean <-> bitfield conversion functions
*
*****************************************************************************/

// to_bits: convert boolean vector to integer bitfield
static inline uint16_t to_bits(Vec16cb const & x) {
    return (uint16_t)_mm_movemask_epi8(x);
}

// to_Vec16cb: convert integer bitfield to boolean vector
static inline Vec16cb to_Vec16cb(uint16_t x) {
    static const uint32_t table[16] = {    // lookup-table
        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF,
        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF,
        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF};
    uint32_t a0 = table[x       & 0xF];
    uint32_t a1 = table[(x>>4)  & 0xF];
    uint32_t a2 = table[(x>>8)  & 0xF];
    uint32_t a3 = table[(x>>12) & 0xF];
    return Vec16cb(Vec16c(Vec4ui(a0, a1, a2, a3)));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec8sb const & x) {
    __m128i a = _mm_packs_epi16(x, x);                     // 16-bit words to bytes
    return (uint8_t)_mm_movemask_epi8(a);
}

// to_Vec8sb: convert integer bitfield to boolean vector
static inline Vec8sb to_Vec8sb(uint8_t x) {
    static const uint32_t table[16] = {    // lookup-table
        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF,
        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF,
        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF};
    uint32_t a0 = table[x      & 0xF];
    uint32_t a1 = table[(x>>4) & 0xF];
    Vec4ui   b  = Vec4ui(a0, a1, a0, a1);
    return _mm_unpacklo_epi8(b, b);                        // duplicate bytes to 16-bit words
}
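// Usage sketch (example values only): with Vec4i a(1,2,3,4) and b(0,2,0,4), the comparison
// Vec4ib f = (a == b) gives (false,true,false,true); horizontal_find_first(f) returns 1,
// horizontal_count(f) returns 2, and to_bits(f) returns 0x0A (bit i set when element i is true).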
#if INSTRSET < 9 || MAX_VECTOR_SIZE < 512
// These functions are defined in Vectori512.h if AVX512 instruction set is used

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec4ib const & x) {
    __m128i a = _mm_packs_epi32(x, x);                     // 32-bit dwords to 16-bit words
    __m128i b = _mm_packs_epi16(a, a);                     // 16-bit words to bytes
    return _mm_movemask_epi8(b) & 0xF;
}

// to_Vec4ib: convert integer bitfield to boolean vector
static inline Vec4ib to_Vec4ib(uint8_t x) {
    static const uint32_t table[16] = {    // lookup-table
        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF,
        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF,
        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF,
        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF};
    uint32_t a = table[x & 0xF];                           // 4 bytes
    __m128i b = _mm_cvtsi32_si128(a);                      // transfer to vector register
    __m128i c = _mm_unpacklo_epi8(b, b);                   // duplicate bytes to 16-bit words
    __m128i d = _mm_unpacklo_epi16(c, c);                  // duplicate 16-bit words to 32-bit dwords
    return d;
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec2qb const & x) {
    uint32_t a = _mm_movemask_epi8(x);
    return (a & 1) | ((a >> 7) & 2);
}

// to_Vec2qb: convert integer bitfield to boolean vector
static inline Vec2qb to_Vec2qb(uint8_t x) {
    return Vec2qb(Vec2q(-(x&1), -((x>>1)&1)));
}

#else  // function prototypes here only

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec4ib x);

// to_Vec4ib: convert integer bitfield to boolean vector
static inline Vec4ib to_Vec4ib(uint8_t x);

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec2qb x);

// to_Vec2qb: convert integer bitfield to boolean vector
static inline Vec2qb to_Vec2qb(uint8_t x);

#endif  // INSTRSET < 9 || MAX_VECTOR_SIZE < 512

#endif // VECTORI128_H