mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-30 11:32:03 +00:00
4333 lines
158 KiB
C++
4333 lines
158 KiB
C++
/**************************** vectori256e.h *******************************
|
|
* Author: Agner Fog
|
|
* Date created: 2012-05-30
|
|
* Last modified: 2014-10-16
|
|
* Version: 1.16
|
|
* Project: vector classes
|
|
* Description:
|
|
* Header file defining 256-bit integer point vector classes as interface
|
|
* to intrinsic functions. Emulated for processors without AVX2 instruction set.
|
|
*
|
|
* The following vector classes are defined here:
|
|
* Vec256b Vector of 256 1-bit unsigned integers or Booleans
|
|
* Vec32c Vector of 32 8-bit signed integers
|
|
* Vec32uc Vector of 32 8-bit unsigned integers
|
|
* Vec32cb Vector of 32 Booleans for use with Vec32c and Vec32uc
|
|
* Vec16s Vector of 16 16-bit signed integers
|
|
* Vec16us Vector of 16 16-bit unsigned integers
|
|
* Vec16sb Vector of 16 Booleans for use with Vec16s and Vec16us
|
|
* Vec8i Vector of 8 32-bit signed integers
|
|
* Vec8ui Vector of 8 32-bit unsigned integers
|
|
* Vec8ib Vector of 8 Booleans for use with Vec8i and Vec8ui
|
|
* Vec4q Vector of 4 64-bit signed integers
|
|
* Vec4uq Vector of 4 64-bit unsigned integers
|
|
* Vec4qb Vector of 4 Booleans for use with Vec4q and Vec4uq
|
|
*
|
|
* For detailed instructions, see VectorClass.pdf
|
|
*
|
|
* (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
|
|
*****************************************************************************/
|
|
|
|
// check combination of header files
|
|
#if defined (VECTORI256_H)
|
|
#if VECTORI256_H != 1
|
|
#error Two different versions of vectori256.h included
|
|
#endif
|
|
#else
|
|
#define VECTORI256_H 1
|
|
|
|
#ifdef VECTORF256_H
|
|
#error Please put header file vectori256.h or vectori256e.h before vectorf256e.h
|
|
#endif
|
|
|
|
|
|
#include "vectori128.h"
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* base class Vec256ie
|
|
*
|
|
*****************************************************************************/
|
|
// base class to replace Vec256ie when AVX2 is not supported
|
|
class Vec256ie {
|
|
protected:
|
|
__m128i y0; // low half
|
|
__m128i y1; // high half
|
|
public:
|
|
Vec256ie(void) {}; // default constructor
|
|
Vec256ie(__m128i x0, __m128i x1) { // constructor to build from two __m128i
|
|
y0 = x0; y1 = x1;
|
|
}
|
|
__m128i get_low() const { // get low half
|
|
return y0;
|
|
}
|
|
__m128i get_high() const { // get high half
|
|
return y1;
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 256 1-bit unsigned integers or Booleans
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec256b : public Vec256ie {
|
|
public:
|
|
// Default constructor:
|
|
Vec256b() {
|
|
}
|
|
// Constructor to broadcast the same value into all elements
|
|
// Removed because of undesired implicit conversions
|
|
//Vec256b(int i) {
|
|
// y1 = y0 = _mm_set1_epi32(-(i & 1));}
|
|
|
|
// Constructor to build from two Vec128b:
|
|
Vec256b(Vec128b const & a0, Vec128b const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec256b(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
}
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec256b & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
}
|
|
// Member function to load from array (unaligned)
|
|
Vec256b & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
// You may use load_a instead of load if you are certain that p points to an address
|
|
// divisible by 32, but there is hardly any speed advantage of load_a on modern processors
|
|
Vec256b & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to store into array (unaligned)
|
|
void store(void * p) const {
|
|
_mm_storeu_si128((__m128i*)p, y0);
|
|
_mm_storeu_si128((__m128i*)p + 1, y1);
|
|
}
|
|
// Member function to store into array, aligned by 32
|
|
// You may use store_a instead of store if you are certain that p points to an address
|
|
// divisible by 32, but there is hardly any speed advantage of load_a on modern processors
|
|
void store_a(void * p) const {
|
|
_mm_store_si128((__m128i*)p, y0);
|
|
_mm_store_si128((__m128i*)p + 1, y1);
|
|
}
|
|
// Member function to change a single bit
|
|
// Note: This function is inefficient. Use load function if changing more than one bit
|
|
Vec256b const & set_bit(uint32_t index, int value) {
|
|
if (index < 128) {
|
|
y0 = Vec128b(y0).set_bit(index, value);
|
|
}
|
|
else {
|
|
y1 = Vec128b(y1).set_bit(index-128, value);
|
|
}
|
|
return *this;
|
|
}
|
|
// Member function to get a single bit
|
|
// Note: This function is inefficient. Use store function if reading more than one bit
|
|
int get_bit(uint32_t index) const {
|
|
if (index < 128) {
|
|
return Vec128b(y0).get_bit(index);
|
|
}
|
|
else {
|
|
return Vec128b(y1).get_bit(index-128);
|
|
}
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
bool operator [] (uint32_t index) const {
|
|
return get_bit(index) != 0;
|
|
}
|
|
// Member functions to split into two Vec128b:
|
|
Vec128b get_low() const {
|
|
return y0;
|
|
}
|
|
Vec128b get_high() const {
|
|
return y1;
|
|
}
|
|
static int size () {
|
|
return 256;
|
|
}
|
|
};
|
|
|
|
// Define operators for this class
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec256b operator & (Vec256b const & a, Vec256b const & b) {
|
|
return Vec256b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec256b operator && (Vec256b const & a, Vec256b const & b) {
|
|
return a & b;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec256b operator | (Vec256b const & a, Vec256b const & b) {
|
|
return Vec256b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec256b operator || (Vec256b const & a, Vec256b const & b) {
|
|
return a | b;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec256b operator ^ (Vec256b const & a, Vec256b const & b) {
|
|
return Vec256b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec256b operator ~ (Vec256b const & a) {
|
|
return Vec256b(~a.get_low(), ~a.get_high());
|
|
}
|
|
|
|
// vector operator &= : bitwise and
|
|
static inline Vec256b & operator &= (Vec256b & a, Vec256b const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator |= : bitwise or
|
|
static inline Vec256b & operator |= (Vec256b & a, Vec256b const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec256b & operator ^= (Vec256b & a, Vec256b const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// Define functions for this class
|
|
|
|
// function andnot: a & ~ b
|
|
static inline Vec256b andnot (Vec256b const & a, Vec256b const & b) {
|
|
return Vec256b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Generate compile-time constant vector
|
|
*
|
|
*****************************************************************************/
|
|
// Generate a constant vector of 8 integers stored in memory.
|
|
// Can be converted to any integer vector type
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
|
|
static inline Vec256ie constant8i() {
|
|
static const union {
|
|
int32_t i[8];
|
|
__m128i y[2];
|
|
} u = {{i0,i1,i2,i3,i4,i5,i6,i7}};
|
|
return Vec256ie(u.y[0], u.y[1]);
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* selectb function
|
|
*
|
|
*****************************************************************************/
|
|
// Select between two sources, byte by byte. Used in various functions and operators
|
|
// Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
|
|
// Only bit 7 in each byte of s is checked,
|
|
static inline Vec256ie selectb (Vec256ie const & s, Vec256ie const & a, Vec256ie const & b) {
|
|
return Vec256ie(selectb(s.get_low(), a.get_low(), b.get_low()),
|
|
selectb(s.get_high(), a.get_high(), b.get_high()));
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Horizontal Boolean functions
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// horizontal_and. Returns true if all bits are 1
|
|
static inline bool horizontal_and (Vec256b const & a) {
|
|
return horizontal_and(a.get_low() & a.get_high());
|
|
}
|
|
|
|
// horizontal_or. Returns true if at least one bit is 1
|
|
static inline bool horizontal_or (Vec256b const & a) {
|
|
return horizontal_or(a.get_low() | a.get_high());
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 32 8-bit signed integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec32c : public Vec256b {
|
|
public:
|
|
// Default constructor:
|
|
Vec32c(){
|
|
};
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec32c(int i) {
|
|
y1 = y0 = _mm_set1_epi8((char)i);
|
|
};
|
|
// Constructor to build from all elements:
|
|
Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7,
|
|
int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15,
|
|
int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23,
|
|
int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31) {
|
|
y0 = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
|
|
y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
|
|
};
|
|
// Constructor to build from two Vec16c:
|
|
Vec32c(Vec16c const & a0, Vec16c const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec32c(Vec256ie const & x) {
|
|
y0 = x.get_low();
|
|
y1 = x.get_high();
|
|
};
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec32c & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low();
|
|
y1 = x.get_high();
|
|
return *this;
|
|
};
|
|
// Member function to load from array (unaligned)
|
|
Vec32c & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec32c & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Partial load. Load n elements and set the rest to 0
|
|
Vec32c & load_partial(int n, void const * p) {
|
|
if (n <= 0) {
|
|
*this = 0;
|
|
}
|
|
else if (n <= 16) {
|
|
*this = Vec32c(Vec16c().load_partial(n, p), 0);
|
|
}
|
|
else if (n < 32) {
|
|
*this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char*)p+16));
|
|
}
|
|
else {
|
|
load(p);
|
|
}
|
|
return *this;
|
|
}
|
|
// Partial store. Store n elements
|
|
void store_partial(int n, void * p) const {
|
|
if (n <= 0) {
|
|
return;
|
|
}
|
|
else if (n <= 16) {
|
|
get_low().store_partial(n, p);
|
|
}
|
|
else if (n < 32) {
|
|
get_low().store(p);
|
|
get_high().store_partial(n-16, (char*)p+16);
|
|
}
|
|
else {
|
|
store(p);
|
|
}
|
|
}
|
|
// cut off vector to n elements. The last 32-n elements are set to zero
|
|
Vec32c & cutoff(int n) {
|
|
if (uint32_t(n) >= 32) return *this;
|
|
static const union {
|
|
int32_t i[16];
|
|
char c[64];
|
|
} mask = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}};
|
|
*this &= Vec32c().load(mask.c+32-n);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec32c const & insert(uint32_t index, int8_t value) {
|
|
if (index < 16) {
|
|
y0 = Vec16c(y0).insert(index, value);
|
|
}
|
|
else {
|
|
y1 = Vec16c(y1).insert(index-16, value);
|
|
}
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
int8_t extract(uint32_t index) const {
|
|
if (index < 16) {
|
|
return Vec16c(y0).extract(index);
|
|
}
|
|
else {
|
|
return Vec16c(y1).extract(index-16);
|
|
}
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
int8_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec16c:
|
|
Vec16c get_low() const {
|
|
return y0;
|
|
}
|
|
Vec16c get_high() const {
|
|
return y1;
|
|
}
|
|
static int size () {
|
|
return 32;
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vec32cb: Vector of 32 Booleans for use with Vec32c and Vec32uc
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec32cb : public Vec32c {
|
|
public:
|
|
// Default constructor:
|
|
Vec32cb(){}
|
|
// Constructor to build from all elements:
|
|
Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
|
|
bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15,
|
|
bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23,
|
|
bool x24, bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31) :
|
|
Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7),
|
|
-int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15),
|
|
-int8_t(x16), -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23),
|
|
-int8_t(x24), -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31))
|
|
{}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec32cb(Vec256ie const & x) {
|
|
y0 = x.get_low();
|
|
y1 = x.get_high();
|
|
}
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec32cb & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low();
|
|
y1 = x.get_high();
|
|
return *this;
|
|
}
|
|
// Constructor to broadcast scalar value:
|
|
Vec32cb(bool b) : Vec32c(-int8_t(b)) {
|
|
}
|
|
// Assignment operator to broadcast scalar value:
|
|
Vec32cb & operator = (bool b) {
|
|
*this = Vec32cb(b);
|
|
return *this;
|
|
}
|
|
private: // Prevent constructing from int, etc.
|
|
Vec32cb(int b);
|
|
Vec32cb & operator = (int x);
|
|
public:
|
|
// Member functions to split into two Vec16c:
|
|
Vec16cb get_low() const {
|
|
return y0;
|
|
}
|
|
Vec16cb get_high() const {
|
|
return y1;
|
|
}
|
|
Vec32cb & insert (int index, bool a) {
|
|
Vec32c::insert(index, -(int)a);
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
bool extract(uint32_t index) const {
|
|
return Vec32c::extract(index) != 0;
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
bool operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Define operators for Vec32cb
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec32cb operator & (Vec32cb const & a, Vec32cb const & b) {
|
|
return Vec32cb(Vec256b(a) & Vec256b(b));
|
|
}
|
|
static inline Vec32cb operator && (Vec32cb const & a, Vec32cb const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec32cb & operator &= (Vec32cb & a, Vec32cb const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec32cb operator | (Vec32cb const & a, Vec32cb const & b) {
|
|
return Vec32cb(Vec256b(a) | Vec256b(b));
|
|
}
|
|
static inline Vec32cb operator || (Vec32cb const & a, Vec32cb const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec32cb & operator |= (Vec32cb & a, Vec32cb const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec32cb operator ^ (Vec32cb const & a, Vec32cb const & b) {
|
|
return Vec32cb(Vec256b(a) ^ Vec256b(b));
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec32cb & operator ^= (Vec32cb & a, Vec32cb const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec32cb operator ~ (Vec32cb const & a) {
|
|
return Vec32cb( ~ Vec256b(a));
|
|
}
|
|
|
|
// vector operator ! : element not
|
|
static inline Vec32cb operator ! (Vec32cb const & a) {
|
|
return ~ a;
|
|
}
|
|
|
|
// vector function andnot
|
|
static inline Vec32cb andnot (Vec32cb const & a, Vec32cb const & b) {
|
|
return Vec32cb(andnot(Vec256b(a), Vec256b(b)));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Operators for Vec32c
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator + : add element by element
|
|
static inline Vec32c operator + (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() + b.get_low(), a.get_high() + b.get_high());
|
|
}
|
|
|
|
// vector operator += : add
|
|
static inline Vec32c & operator += (Vec32c & a, Vec32c const & b) {
|
|
a = a + b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator ++
|
|
static inline Vec32c operator ++ (Vec32c & a, int) {
|
|
Vec32c a0 = a;
|
|
a = a + 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator ++
|
|
static inline Vec32c & operator ++ (Vec32c & a) {
|
|
a = a + 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator - : subtract element by element
|
|
static inline Vec32c operator - (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() - b.get_low(), a.get_high() - b.get_high());
|
|
}
|
|
|
|
// vector operator - : unary minus
|
|
static inline Vec32c operator - (Vec32c const & a) {
|
|
return Vec32c(-a.get_low(), -a.get_high());
|
|
}
|
|
|
|
// vector operator -= : add
|
|
static inline Vec32c & operator -= (Vec32c & a, Vec32c const & b) {
|
|
a = a - b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator --
|
|
static inline Vec32c operator -- (Vec32c & a, int) {
|
|
Vec32c a0 = a;
|
|
a = a - 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator --
|
|
static inline Vec32c & operator -- (Vec32c & a) {
|
|
a = a - 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator * : multiply element by element
|
|
static inline Vec32c operator * (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() * b.get_low(), a.get_high() * b.get_high());
|
|
}
|
|
|
|
// vector operator *= : multiply
|
|
static inline Vec32c & operator *= (Vec32c & a, Vec32c const & b) {
|
|
a = a * b;
|
|
return a;
|
|
}
|
|
|
|
// vector of 32 8-bit signed integers
|
|
static inline Vec32c operator / (Vec32c const & a, Divisor_s const & d) {
|
|
return Vec32c(a.get_low() / d, a.get_high() / d);
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
static inline Vec32c & operator /= (Vec32c & a, Divisor_s const & d) {
|
|
a = a / d;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec32c operator << (Vec32c const & a, int b) {
|
|
return Vec32c(a.get_low() << b, a.get_high() << b);
|
|
}
|
|
|
|
// vector operator <<= : shift left
|
|
static inline Vec32c & operator <<= (Vec32c & a, int b) {
|
|
a = a << b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >> : shift right arithmetic all elements
|
|
static inline Vec32c operator >> (Vec32c const & a, int b) {
|
|
return Vec32c(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >>= : shift right artihmetic
|
|
static inline Vec32c & operator >>= (Vec32c & a, int b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator == : returns true for elements for which a == b
|
|
static inline Vec32cb operator == (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() == b.get_low(), a.get_high() == b.get_high());
|
|
}
|
|
|
|
// vector operator != : returns true for elements for which a != b
|
|
static inline Vec32cb operator != (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() != b.get_low(), a.get_high() != b.get_high());
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b (signed)
|
|
static inline Vec32cb operator > (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b (signed)
|
|
static inline Vec32cb operator < (Vec32c const & a, Vec32c const & b) {
|
|
return b > a;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (signed)
|
|
static inline Vec32cb operator >= (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (signed)
|
|
static inline Vec32cb operator <= (Vec32c const & a, Vec32c const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec32c operator & (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec32c operator && (Vec32c const & a, Vec32c const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec32c & operator &= (Vec32c & a, Vec32c const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec32c operator | (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec32c operator || (Vec32c const & a, Vec32c const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec32c & operator |= (Vec32c & a, Vec32c const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec32c operator ^ (Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec32c & operator ^= (Vec32c & a, Vec32c const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec32c operator ~ (Vec32c const & a) {
|
|
return Vec32c(~a.get_low(), ~a.get_high());
|
|
}
|
|
|
|
// vector operator ! : logical not, returns true for elements == 0
|
|
static inline Vec32cb operator ! (Vec32c const & a) {
|
|
return Vec32c(!a.get_low(), !a.get_high());
|
|
}
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
static inline Vec32c select (Vec32cb const & s, Vec32c const & a, Vec32c const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec32c if_add (Vec32cb const & f, Vec32c const & a, Vec32c const & b) {
|
|
return a + (Vec32c(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
static inline uint32_t horizontal_add (Vec32c const & a) {
|
|
return horizontal_add(a.get_low() + a.get_high());
|
|
}
|
|
|
|
// Horizontal add extended: Calculates the sum of all vector elements.
|
|
// Each element is sign-extended before addition to avoid overflow
|
|
static inline int32_t horizontal_add_x (Vec32c const & a) {
|
|
return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
|
|
}
|
|
|
|
|
|
// function add_saturated: add element by element, signed with saturation
|
|
static inline Vec32c add_saturated(Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function sub_saturated: subtract element by element, signed with saturation
|
|
static inline Vec32c sub_saturated(Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec32c max(Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec32c min(Vec32c const & a, Vec32c const & b) {
|
|
return Vec32c(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function abs: a >= 0 ? a : -a
|
|
static inline Vec32c abs(Vec32c const & a) {
|
|
return Vec32c(abs(a.get_low()), abs(a.get_high()));
|
|
}
|
|
|
|
// function abs_saturated: same as abs, saturate if overflow
|
|
static inline Vec32c abs_saturated(Vec32c const & a) {
|
|
return Vec32c(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
|
|
}
|
|
|
|
// function rotate_left all elements
|
|
// Use negative count to rotate right
|
|
static inline Vec32c rotate_left(Vec32c const & a, int b) {
|
|
return Vec32c(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 16 8-bit unsigned integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec32uc : public Vec32c {
|
|
public:
|
|
// Default constructor:
|
|
Vec32uc(){
|
|
};
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec32uc(uint32_t i) {
|
|
y1 = y0 = _mm_set1_epi8((char)i);
|
|
};
|
|
// Constructor to build from all elements:
|
|
Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7,
|
|
uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15,
|
|
uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23,
|
|
uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31) {
|
|
y0 = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
|
|
y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
|
|
};
|
|
// Constructor to build from two Vec16uc:
|
|
Vec32uc(Vec16uc const & a0, Vec16uc const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec32uc(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
};
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec32uc & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
};
|
|
// Member function to load from array (unaligned)
|
|
Vec32uc & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec32uc & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec32uc const & insert(uint32_t index, uint8_t value) {
|
|
Vec32c::insert(index, value);
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
uint8_t extract(uint32_t index) const {
|
|
return Vec32c::extract(index);
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
uint8_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec16uc:
|
|
Vec16uc get_low() const {
|
|
return y0;
|
|
}
|
|
Vec16uc get_high() const {
|
|
return y1;
|
|
}
|
|
};
|
|
|
|
// Define operators for this class
|
|
|
|
// vector operator + : add
|
|
static inline Vec32uc operator + (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(a.get_low() + b.get_low(), a.get_high() + b.get_high());
|
|
}
|
|
|
|
// vector operator - : subtract
|
|
static inline Vec32uc operator - (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(a.get_low() - b.get_low(), a.get_high() - b.get_high());
|
|
}
|
|
|
|
// vector operator * : multiply
|
|
static inline Vec32uc operator * (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(a.get_low() * b.get_low(), a.get_high() * b.get_high());
|
|
}
|
|
|
|
// vector operator / : divide
|
|
static inline Vec32uc operator / (Vec32uc const & a, Divisor_us const & d) {
|
|
return Vec32uc(a.get_low() / d, a.get_high() / d);
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
static inline Vec32uc & operator /= (Vec32uc & a, Divisor_us const & d) {
|
|
a = a / d;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec32uc operator << (Vec32uc const & a, uint32_t b) {
|
|
return Vec32uc(a.get_low() << b, a.get_high() << b);
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec32uc operator << (Vec32uc const & a, int32_t b) {
|
|
return a << (uint32_t)b;
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec32uc operator >> (Vec32uc const & a, uint32_t b) {
|
|
return Vec32uc(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec32uc operator >> (Vec32uc const & a, int32_t b) {
|
|
return a >> (uint32_t)b;
|
|
}
|
|
|
|
// vector operator >>= : shift right artihmetic
|
|
static inline Vec32uc & operator >>= (Vec32uc & a, uint32_t b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (unsigned)
|
|
static inline Vec32cb operator >= (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (unsigned)
|
|
static inline Vec32cb operator <= (Vec32uc const & a, Vec32uc const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b (unsigned)
|
|
static inline Vec32cb operator > (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b (unsigned)
|
|
static inline Vec32cb operator < (Vec32uc const & a, Vec32uc const & b) {
|
|
return b > a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec32uc operator & (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec32uc operator && (Vec32uc const & a, Vec32uc const & b) {
|
|
return a & b;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec32uc operator | (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec32uc operator || (Vec32uc const & a, Vec32uc const & b) {
|
|
return a | b;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec32uc operator ^ (Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec32uc operator ~ (Vec32uc const & a) {
|
|
return Vec32uc(~a.get_low(), ~a.get_high());
|
|
}
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
// (s is signed)
|
|
static inline Vec32uc select (Vec32cb const & s, Vec32uc const & a, Vec32uc const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec32uc if_add (Vec32cb const & f, Vec32uc const & a, Vec32uc const & b) {
|
|
return a + (Vec32uc(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
// (Note: horizontal_add_x(Vec32uc) is slightly faster)
|
|
static inline uint32_t horizontal_add (Vec32uc const & a) {
|
|
return horizontal_add(a.get_low() + a.get_high());
|
|
}
|
|
|
|
// Horizontal add extended: Calculates the sum of all vector elements.
|
|
// Each element is zero-extended before addition to avoid overflow
|
|
static inline uint32_t horizontal_add_x (Vec32uc const & a) {
|
|
return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
|
|
}
|
|
|
|
// function add_saturated: add element by element, unsigned with saturation
|
|
static inline Vec32uc add_saturated(Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function sub_saturated: subtract element by element, unsigned with saturation
|
|
static inline Vec32uc sub_saturated(Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec32uc max(Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec32uc min(Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 16 16-bit signed integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec16s : public Vec256b {
|
|
public:
|
|
// Default constructor:
|
|
Vec16s() {
|
|
};
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec16s(int i) {
|
|
y1 = y0 = _mm_set1_epi16((int16_t)i);
|
|
};
|
|
// Constructor to build from all elements:
|
|
Vec16s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7,
|
|
int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15) {
|
|
y0 = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
|
|
y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15);
|
|
};
|
|
// Constructor to build from two Vec8s:
|
|
Vec16s(Vec8s const & a0, Vec8s const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec16s(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
};
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec16s & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
};
|
|
// Member function to load from array (unaligned)
|
|
Vec16s & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec16s & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Partial load. Load n elements and set the rest to 0
|
|
Vec16s & load_partial(int n, void const * p) {
|
|
if (n <= 0) {
|
|
*this = 0;
|
|
}
|
|
else if (n <= 8) {
|
|
*this = Vec16s(Vec8s().load_partial(n, p), 0);
|
|
}
|
|
else if (n < 16) {
|
|
*this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t*)p+8));
|
|
}
|
|
else {
|
|
load(p);
|
|
}
|
|
return *this;
|
|
}
|
|
// Partial store. Store n elements
|
|
void store_partial(int n, void * p) const {
|
|
if (n <= 0) {
|
|
return;
|
|
}
|
|
else if (n <= 8) {
|
|
get_low().store_partial(n, p);
|
|
}
|
|
else if (n < 16) {
|
|
get_low().store(p);
|
|
get_high().store_partial(n-8, (int16_t*)p+8);
|
|
}
|
|
else {
|
|
store(p);
|
|
}
|
|
}
|
|
// cut off vector to n elements. The last 16-n elements are set to zero
|
|
Vec16s & cutoff(int n) {
|
|
*this = Vec32c(*this).cutoff(n * 2);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec16s const & insert(uint32_t index, int16_t value) {
|
|
if (index < 8) {
|
|
y0 = Vec8s(y0).insert(index, value);
|
|
}
|
|
else {
|
|
y1 = Vec8s(y1).insert(index-8, value);
|
|
}
|
|
return *this;
|
|
};
|
|
// Member function extract a single element from vector
|
|
int16_t extract(uint32_t index) const {
|
|
if (index < 8) {
|
|
return Vec8s(y0).extract(index);
|
|
}
|
|
else {
|
|
return Vec8s(y1).extract(index-8);
|
|
}
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
int16_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec8s:
|
|
Vec8s get_low() const {
|
|
return y0;
|
|
}
|
|
Vec8s get_high() const {
|
|
return y1;
|
|
}
|
|
static int size () {
|
|
return 16;
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vec16sb: Vector of 16 Booleans for use with Vec16s and Vec16us
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec16sb : public Vec16s {
|
|
public:
|
|
// Default constructor:
|
|
Vec16sb() {
|
|
}
|
|
// Constructor to build from all elements:
|
|
Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
|
|
bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
|
|
Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7),
|
|
-int16_t(x8), -int16_t(x9), -int16_t(x10), -int16_t(x11), -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15))
|
|
{}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec16sb(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
}
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec16sb & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
}
|
|
// Constructor to broadcast scalar value:
|
|
Vec16sb(bool b) : Vec16s(-int16_t(b)) {
|
|
}
|
|
// Assignment operator to broadcast scalar value:
|
|
Vec16sb & operator = (bool b) {
|
|
*this = Vec16sb(b);
|
|
return *this;
|
|
}
|
|
private: // Prevent constructing from int, etc.
|
|
Vec16sb(int b);
|
|
Vec16sb & operator = (int x);
|
|
public:
|
|
// Member functions to split into two Vec8s:
|
|
Vec8sb get_low() const {
|
|
return y0;
|
|
}
|
|
Vec8sb get_high() const {
|
|
return y1;
|
|
}
|
|
Vec16sb & insert (int index, bool a) {
|
|
Vec16s::insert(index, -(int)a);
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
bool extract(uint32_t index) const {
|
|
return Vec16s::extract(index) != 0;
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
bool operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Define operators for Vec16sb
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec16sb operator & (Vec16sb const & a, Vec16sb const & b) {
|
|
return Vec16sb(Vec256b(a) & Vec256b(b));
|
|
}
|
|
static inline Vec16sb operator && (Vec16sb const & a, Vec16sb const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec16sb & operator &= (Vec16sb & a, Vec16sb const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec16sb operator | (Vec16sb const & a, Vec16sb const & b) {
|
|
return Vec16sb(Vec256b(a) | Vec256b(b));
|
|
}
|
|
static inline Vec16sb operator || (Vec16sb const & a, Vec16sb const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec16sb & operator |= (Vec16sb & a, Vec16sb const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec16sb operator ^ (Vec16sb const & a, Vec16sb const & b) {
|
|
return Vec16sb(Vec256b(a) ^ Vec256b(b));
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec16sb & operator ^= (Vec16sb & a, Vec16sb const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec16sb operator ~ (Vec16sb const & a) {
|
|
return Vec16sb( ~ Vec256b(a));
|
|
}
|
|
|
|
// vector operator ! : element not
|
|
static inline Vec16sb operator ! (Vec16sb const & a) {
|
|
return ~ a;
|
|
}
|
|
|
|
// vector function andnot
|
|
static inline Vec16sb andnot (Vec16sb const & a, Vec16sb const & b) {
|
|
return Vec16sb(andnot(Vec256b(a), Vec256b(b)));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Operators for Vec16s
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator + : add element by element
|
|
static inline Vec16s operator + (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() + b.get_low(), a.get_high() + b.get_high());
|
|
}
|
|
|
|
// vector operator += : add
|
|
static inline Vec16s & operator += (Vec16s & a, Vec16s const & b) {
|
|
a = a + b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator ++
|
|
static inline Vec16s operator ++ (Vec16s & a, int) {
|
|
Vec16s a0 = a;
|
|
a = a + 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator ++
|
|
static inline Vec16s & operator ++ (Vec16s & a) {
|
|
a = a + 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator - : subtract element by element
|
|
static inline Vec16s operator - (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() - b.get_low(), a.get_high() - b.get_high());
|
|
}
|
|
|
|
// vector operator - : unary minus
|
|
static inline Vec16s operator - (Vec16s const & a) {
|
|
return Vec16s(-a.get_low(), -a.get_high());
|
|
}
|
|
|
|
// vector operator -= : subtract
|
|
static inline Vec16s & operator -= (Vec16s & a, Vec16s const & b) {
|
|
a = a - b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator --
|
|
static inline Vec16s operator -- (Vec16s & a, int) {
|
|
Vec16s a0 = a;
|
|
a = a - 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator --
|
|
static inline Vec16s & operator -- (Vec16s & a) {
|
|
a = a - 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator * : multiply element by element
|
|
static inline Vec16s operator * (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() * b.get_low(), a.get_high() * b.get_high());
|
|
}
|
|
|
|
// vector operator *= : multiply
|
|
static inline Vec16s & operator *= (Vec16s & a, Vec16s const & b) {
|
|
a = a * b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator / : divide all elements by same integer
|
|
static inline Vec16s operator / (Vec16s const & a, Divisor_s const & d) {
|
|
return Vec16s(a.get_low() / d, a.get_high() / d);
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
static inline Vec16s & operator /= (Vec16s & a, Divisor_s const & d) {
|
|
a = a / d;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left
|
|
static inline Vec16s operator << (Vec16s const & a, int b) {
|
|
return Vec16s(a.get_low() << b, a.get_high() << b);
|
|
}
|
|
|
|
// vector operator <<= : shift left
|
|
static inline Vec16s & operator <<= (Vec16s & a, int b) {
|
|
a = a << b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >> : shift right arithmetic
|
|
static inline Vec16s operator >> (Vec16s const & a, int b) {
|
|
return Vec16s(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >>= : shift right arithmetic
|
|
static inline Vec16s & operator >>= (Vec16s & a, int b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator == : returns true for elements for which a == b
|
|
static inline Vec16sb operator == (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() == b.get_low(), a.get_high() == b.get_high());
|
|
}
|
|
|
|
// vector operator != : returns true for elements for which a != b
|
|
static inline Vec16sb operator != (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() != b.get_low(), a.get_high() != b.get_high());
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b
|
|
static inline Vec16sb operator > (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() > b.get_low(), a.get_high() > b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b
|
|
static inline Vec16sb operator < (Vec16s const & a, Vec16s const & b) {
|
|
return b > a;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (signed)
|
|
static inline Vec16sb operator >= (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (signed)
|
|
static inline Vec16sb operator <= (Vec16s const & a, Vec16s const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec16s operator & (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec16s operator && (Vec16s const & a, Vec16s const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec16s & operator &= (Vec16s & a, Vec16s const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec16s operator | (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec16s operator || (Vec16s const & a, Vec16s const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec16s & operator |= (Vec16s & a, Vec16s const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec16s operator ^ (Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec16s & operator ^= (Vec16s & a, Vec16s const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec16s operator ~ (Vec16s const & a) {
|
|
return Vec16s(~Vec256b(a));
|
|
}
|
|
|
|
// vector operator ! : logical not, returns true for elements == 0
|
|
static inline Vec16sb operator ! (Vec16s const & a) {
|
|
return Vec16s(!a.get_low(), !a.get_high());
|
|
}
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
// (s is signed)
|
|
static inline Vec16s select (Vec16sb const & s, Vec16s const & a, Vec16s const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec16s if_add (Vec16sb const & f, Vec16s const & a, Vec16s const & b) {
|
|
return a + (Vec16s(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
static inline int32_t horizontal_add (Vec16s const & a) {
|
|
return horizontal_add(a.get_low() + a.get_high());
|
|
}
|
|
|
|
// Horizontal add extended: Calculates the sum of all vector elements.
|
|
// Elements are sign extended before adding to avoid overflow
|
|
static inline int32_t horizontal_add_x (Vec16s const & a) {
|
|
return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
|
|
}
|
|
|
|
// function add_saturated: add element by element, signed with saturation
|
|
static inline Vec16s add_saturated(Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function sub_saturated: subtract element by element, signed with saturation
|
|
static inline Vec16s sub_saturated(Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec16s max(Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec16s min(Vec16s const & a, Vec16s const & b) {
|
|
return Vec16s(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function abs: a >= 0 ? a : -a
|
|
static inline Vec16s abs(Vec16s const & a) {
|
|
return Vec16s(abs(a.get_low()), abs(a.get_high()));
|
|
}
|
|
|
|
// function abs_saturated: same as abs, saturate if overflow
|
|
static inline Vec16s abs_saturated(Vec16s const & a) {
|
|
return Vec16s(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
|
|
}
|
|
|
|
// function rotate_left all elements
|
|
// Use negative count to rotate right
|
|
static inline Vec16s rotate_left(Vec16s const & a, int b) {
|
|
return Vec16s(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 16 16-bit unsigned integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec16us : public Vec16s {
|
|
public:
|
|
// Default constructor:
|
|
Vec16us(){
|
|
};
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec16us(uint32_t i) {
|
|
y1 = y0 = _mm_set1_epi16((int16_t)i);
|
|
};
|
|
// Constructor to build from all elements:
|
|
Vec16us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7,
|
|
uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15) {
|
|
y0 = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
|
|
y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15 );
|
|
};
|
|
// Constructor to build from two Vec8us:
|
|
Vec16us(Vec8us const & a0, Vec8us const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec16us(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
};
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec16us & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
};
|
|
// Member function to load from array (unaligned)
|
|
Vec16us & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec16us & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec16us const & insert(uint32_t index, uint16_t value) {
|
|
Vec16s::insert(index, value);
|
|
return *this;
|
|
};
|
|
// Member function extract a single element from vector
|
|
uint16_t extract(uint32_t index) const {
|
|
return Vec16s::extract(index);
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
uint16_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec8us:
|
|
Vec8us get_low() const {
|
|
return y0;
|
|
}
|
|
Vec8us get_high() const {
|
|
return y1;
|
|
}
|
|
};
|
|
|
|
// Define operators for this class
|
|
|
|
// vector operator + : add
|
|
static inline Vec16us operator + (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(a.get_low() + b.get_low(), a.get_high() + b.get_high());
|
|
}
|
|
|
|
// vector operator - : subtract
|
|
static inline Vec16us operator - (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(a.get_low() - b.get_low(), a.get_high() - b.get_high());
|
|
}
|
|
|
|
// vector operator * : multiply
|
|
static inline Vec16us operator * (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(a.get_low() * b.get_low(), a.get_high() * b.get_high());
|
|
}
|
|
|
|
// vector operator / : divide
|
|
static inline Vec16us operator / (Vec16us const & a, Divisor_us const & d) {
|
|
return Vec16us(a.get_low() / d, a.get_high() / d);
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
static inline Vec16us & operator /= (Vec16us & a, Divisor_us const & d) {
|
|
a = a / d;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec16us operator >> (Vec16us const & a, uint32_t b) {
|
|
return Vec16us(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec16us operator >> (Vec16us const & a, int b) {
|
|
return a >> (uint32_t)b;
|
|
}
|
|
|
|
// vector operator >>= : shift right artihmetic
|
|
static inline Vec16us & operator >>= (Vec16us & a, uint32_t b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec16us operator << (Vec16us const & a, uint32_t b) {
|
|
return Vec16us(a.get_low() << b, a.get_high() << b);
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec16us operator << (Vec16us const & a, int32_t b) {
|
|
return a << (uint32_t)b;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (unsigned)
|
|
static inline Vec16sb operator >= (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16s(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (unsigned)
|
|
static inline Vec16sb operator <= (Vec16us const & a, Vec16us const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b (unsigned)
|
|
static inline Vec16sb operator > (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16s(a.get_low() > b.get_low(), a.get_high() > b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b (unsigned)
|
|
static inline Vec16sb operator < (Vec16us const & a, Vec16us const & b) {
|
|
return b > a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec16us operator & (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec16us operator && (Vec16us const & a, Vec16us const & b) {
|
|
return a & b;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec16us operator | (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec16us operator || (Vec16us const & a, Vec16us const & b) {
|
|
return a | b;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec16us operator ^ (Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec16us operator ~ (Vec16us const & a) {
|
|
return Vec16us(~ Vec256b(a));
|
|
}
|
|
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
// (s is signed)
|
|
static inline Vec16us select (Vec16sb const & s, Vec16us const & a, Vec16us const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec16us if_add (Vec16sb const & f, Vec16us const & a, Vec16us const & b) {
|
|
return a + (Vec16us(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
static inline uint32_t horizontal_add (Vec16us const & a) {
|
|
return horizontal_add(a.get_low() + a.get_high());
|
|
}
|
|
|
|
// Horizontal add extended: Calculates the sum of all vector elements.
|
|
// Each element is zero-extended before addition to avoid overflow
|
|
static inline uint32_t horizontal_add_x (Vec16us const & a) {
|
|
return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
|
|
}
|
|
|
|
// function add_saturated: add element by element, unsigned with saturation
|
|
static inline Vec16us add_saturated(Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function sub_saturated: subtract element by element, unsigned with saturation
|
|
static inline Vec16us sub_saturated(Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec16us max(Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec16us min(Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 8 32-bit signed integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec8i : public Vec256b {
|
|
public:
|
|
// Default constructor:
|
|
Vec8i() {
|
|
}
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec8i(int i) {
|
|
y1 = y0 = _mm_set1_epi32(i);
|
|
}
|
|
// Constructor to build from all elements:
|
|
Vec8i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7) {
|
|
y0 = _mm_setr_epi32(i0, i1, i2, i3);
|
|
y1 = _mm_setr_epi32(i4, i5, i6, i7);
|
|
}
|
|
// Constructor to build from two Vec4i:
|
|
Vec8i(Vec4i const & a0, Vec4i const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec8i(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
}
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec8i & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
}
|
|
// Member function to load from array (unaligned)
|
|
Vec8i & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec8i & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Partial load. Load n elements and set the rest to 0
|
|
Vec8i & load_partial(int n, void const * p) {
|
|
if (n <= 0) {
|
|
*this = 0;
|
|
}
|
|
else if (n <= 4) {
|
|
*this = Vec8i(Vec4i().load_partial(n, p), 0);
|
|
}
|
|
else if (n < 8) {
|
|
*this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t*)p+4));
|
|
}
|
|
else {
|
|
load(p);
|
|
}
|
|
return *this;
|
|
}
|
|
// Partial store. Store n elements
|
|
void store_partial(int n, void * p) const {
|
|
if (n <= 0) {
|
|
return;
|
|
}
|
|
else if (n <= 4) {
|
|
get_low().store_partial(n, p);
|
|
}
|
|
else if (n < 8) {
|
|
get_low().store(p);
|
|
get_high().store_partial(n-4, (int32_t*)p+4);
|
|
}
|
|
else {
|
|
store(p);
|
|
}
|
|
}
|
|
// cut off vector to n elements. The last 8-n elements are set to zero
|
|
Vec8i & cutoff(int n) {
|
|
*this = Vec32c(*this).cutoff(n * 4);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec8i const & insert(uint32_t index, int32_t value) {
|
|
if (index < 4) {
|
|
y0 = Vec4i(y0).insert(index, value);
|
|
}
|
|
else {
|
|
y1 = Vec4i(y1).insert(index-4, value);
|
|
}
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
// Note: This function is inefficient. Use store function if extracting more than one element
|
|
int32_t extract(uint32_t index) const {
|
|
if (index < 4) {
|
|
return Vec4i(y0).extract(index);
|
|
}
|
|
else {
|
|
return Vec4i(y1).extract(index-4);
|
|
}
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
int32_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec4i:
|
|
Vec4i get_low() const {
|
|
return y0;
|
|
}
|
|
Vec4i get_high() const {
|
|
return y1;
|
|
}
|
|
static int size () {
|
|
return 8;
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vec8ib: Vector of 8 Booleans for use with Vec8i and Vec8ui
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec8ib : public Vec8i {
|
|
public:
|
|
// Default constructor:
|
|
Vec8ib() {
|
|
}
|
|
// Constructor to build from all elements:
|
|
Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) :
|
|
Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7))
|
|
{}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec8ib(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
}
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec8ib & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
}
|
|
// Constructor to broadcast scalar value:
|
|
Vec8ib(bool b) : Vec8i(-int32_t(b)) {
|
|
}
|
|
// Assignment operator to broadcast scalar value:
|
|
Vec8ib & operator = (bool b) {
|
|
*this = Vec8ib(b);
|
|
return *this;
|
|
}
|
|
private: // Prevent constructing from int, etc.
|
|
Vec8ib(int b);
|
|
Vec8ib & operator = (int x);
|
|
public:
|
|
// Member functions to split into two Vec4i:
|
|
Vec4ib get_low() const {
|
|
return y0;
|
|
}
|
|
Vec4ib get_high() const {
|
|
return y1;
|
|
}
|
|
Vec8ib & insert (int index, bool a) {
|
|
Vec8i::insert(index, -(int)a);
|
|
return *this;
|
|
};
|
|
// Member function extract a single element from vector
|
|
// Note: This function is inefficient. Use store function if extracting more than one element
|
|
bool extract(uint32_t index) const {
|
|
return Vec8i::extract(index) != 0;
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
bool operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
};
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Define operators for Vec8ib
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec8ib operator & (Vec8ib const & a, Vec8ib const & b) {
|
|
return Vec8ib(Vec256b(a) & Vec256b(b));
|
|
}
|
|
static inline Vec8ib operator && (Vec8ib const & a, Vec8ib const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec8ib & operator &= (Vec8ib & a, Vec8ib const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec8ib operator | (Vec8ib const & a, Vec8ib const & b) {
|
|
return Vec8ib(Vec256b(a) | Vec256b(b));
|
|
}
|
|
static inline Vec8ib operator || (Vec8ib const & a, Vec8ib const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec8ib & operator |= (Vec8ib & a, Vec8ib const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec8ib operator ^ (Vec8ib const & a, Vec8ib const & b) {
|
|
return Vec8ib(Vec256b(a) ^ Vec256b(b));
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec8ib & operator ^= (Vec8ib & a, Vec8ib const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec8ib operator ~ (Vec8ib const & a) {
|
|
return Vec8ib( ~ Vec256b(a));
|
|
}
|
|
|
|
// vector operator ! : element not
|
|
static inline Vec8ib operator ! (Vec8ib const & a) {
|
|
return ~ a;
|
|
}
|
|
|
|
// vector function andnot
|
|
static inline Vec8ib andnot (Vec8ib const & a, Vec8ib const & b) {
|
|
return Vec8ib(andnot(Vec256b(a), Vec256b(b)));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Operators for Vec8i
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator + : add element by element
|
|
static inline Vec8i operator + (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() + b.get_low(), a.get_high() + b.get_high());
|
|
}
|
|
|
|
// vector operator += : add
|
|
static inline Vec8i & operator += (Vec8i & a, Vec8i const & b) {
|
|
a = a + b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator ++
|
|
static inline Vec8i operator ++ (Vec8i & a, int) {
|
|
Vec8i a0 = a;
|
|
a = a + 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator ++
|
|
static inline Vec8i & operator ++ (Vec8i & a) {
|
|
a = a + 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator - : subtract element by element
|
|
static inline Vec8i operator - (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() - b.get_low(), a.get_high() - b.get_high());
|
|
}
|
|
|
|
// vector operator - : unary minus
|
|
static inline Vec8i operator - (Vec8i const & a) {
|
|
return Vec8i(-a.get_low(), -a.get_high());
|
|
}
|
|
|
|
// vector operator -= : subtract
|
|
static inline Vec8i & operator -= (Vec8i & a, Vec8i const & b) {
|
|
a = a - b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator --
|
|
static inline Vec8i operator -- (Vec8i & a, int) {
|
|
Vec8i a0 = a;
|
|
a = a - 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator --
|
|
static inline Vec8i & operator -- (Vec8i & a) {
|
|
a = a - 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator * : multiply element by element
|
|
static inline Vec8i operator * (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() * b.get_low(), a.get_high() * b.get_high());
|
|
}
|
|
|
|
// vector operator *= : multiply
|
|
static inline Vec8i & operator *= (Vec8i & a, Vec8i const & b) {
|
|
a = a * b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator / : divide all elements by same integer
|
|
static inline Vec8i operator / (Vec8i const & a, Divisor_i const & d) {
|
|
return Vec8i(a.get_low() / d, a.get_high() / d);
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
static inline Vec8i & operator /= (Vec8i & a, Divisor_i const & d) {
|
|
a = a / d;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left
|
|
static inline Vec8i operator << (Vec8i const & a, int32_t b) {
|
|
return Vec8i(a.get_low() << b, a.get_high() << b);
|
|
}
|
|
|
|
// vector operator <<= : shift left
|
|
static inline Vec8i & operator <<= (Vec8i & a, int32_t b) {
|
|
a = a << b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >> : shift right arithmetic
|
|
static inline Vec8i operator >> (Vec8i const & a, int32_t b) {
|
|
return Vec8i(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >>= : shift right arithmetic
|
|
static inline Vec8i & operator >>= (Vec8i & a, int32_t b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator == : returns true for elements for which a == b
|
|
static inline Vec8ib operator == (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() == b.get_low(), a.get_high() == b.get_high());
|
|
}
|
|
|
|
// vector operator != : returns true for elements for which a != b
|
|
static inline Vec8ib operator != (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() != b.get_low(), a.get_high() != b.get_high());
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b
|
|
static inline Vec8ib operator > (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() > b.get_low(), a.get_high() > b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b
|
|
static inline Vec8ib operator < (Vec8i const & a, Vec8i const & b) {
|
|
return b > a;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (signed)
|
|
static inline Vec8ib operator >= (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (signed)
|
|
static inline Vec8ib operator <= (Vec8i const & a, Vec8i const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec8i operator & (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec8i operator && (Vec8i const & a, Vec8i const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec8i & operator &= (Vec8i & a, Vec8i const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec8i operator | (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec8i operator || (Vec8i const & a, Vec8i const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec8i & operator |= (Vec8i & a, Vec8i const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec8i operator ^ (Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec8i & operator ^= (Vec8i & a, Vec8i const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec8i operator ~ (Vec8i const & a) {
|
|
return Vec8i(~a.get_low(), ~a.get_high());
|
|
}
|
|
|
|
// vector operator ! : returns true for elements == 0
|
|
static inline Vec8ib operator ! (Vec8i const & a) {
|
|
return Vec8i(!a.get_low(), !a.get_high());
|
|
}
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
// (s is signed)
|
|
static inline Vec8i select (Vec8ib const & s, Vec8i const & a, Vec8i const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec8i if_add (Vec8ib const & f, Vec8i const & a, Vec8i const & b) {
|
|
return a + (Vec8i(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
static inline int32_t horizontal_add (Vec8i const & a) {
|
|
return horizontal_add(a.get_low() + a.get_high());
|
|
}
|
|
|
|
// Horizontal add extended: Calculates the sum of all vector elements.
|
|
// Elements are sign extended before adding to avoid overflow
|
|
static inline int64_t horizontal_add_x (Vec8i const & a) {
|
|
return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
|
|
}
|
|
|
|
// function add_saturated: add element by element, signed with saturation
|
|
static inline Vec8i add_saturated(Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function sub_saturated: subtract element by element, signed with saturation
|
|
static inline Vec8i sub_saturated(Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec8i max(Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec8i min(Vec8i const & a, Vec8i const & b) {
|
|
return Vec8i(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function abs: a >= 0 ? a : -a
|
|
static inline Vec8i abs(Vec8i const & a) {
|
|
return Vec8i(abs(a.get_low()), abs(a.get_high()));
|
|
}
|
|
|
|
// function abs_saturated: same as abs, saturate if overflow
|
|
static inline Vec8i abs_saturated(Vec8i const & a) {
|
|
return Vec8i(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
|
|
}
|
|
|
|
// function rotate_left all elements
|
|
// Use negative count to rotate right
|
|
static inline Vec8i rotate_left(Vec8i const & a, int b) {
|
|
return Vec8i(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 4 32-bit unsigned integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec8ui : public Vec8i {
|
|
public:
|
|
// Default constructor:
|
|
Vec8ui() {
|
|
};
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec8ui(uint32_t i) {
|
|
y1 = y0 = _mm_set1_epi32(i);
|
|
};
|
|
// Constructor to build from all elements:
|
|
Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7) {
|
|
y0 = _mm_setr_epi32(i0, i1, i2, i3);
|
|
y1 = _mm_setr_epi32(i4, i5, i6, i7);
|
|
};
|
|
// Constructor to build from two Vec4ui:
|
|
Vec8ui(Vec4ui const & a0, Vec4ui const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec8ui(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
};
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec8ui & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
};
|
|
// Member function to load from array (unaligned)
|
|
Vec8ui & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec8ui & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec8ui const & insert(uint32_t index, uint32_t value) {
|
|
Vec8i::insert(index, value);
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
uint32_t extract(uint32_t index) const {
|
|
return Vec8i::extract(index);
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
uint32_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec4ui:
|
|
Vec4ui get_low() const {
|
|
return y0;
|
|
}
|
|
Vec4ui get_high() const {
|
|
return y1;
|
|
}
|
|
};
|
|
|
|
// Define operators for this class
|
|
|
|
// vector operator + : add
|
|
static inline Vec8ui operator + (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui (Vec8i(a) + Vec8i(b));
|
|
}
|
|
|
|
// vector operator - : subtract
|
|
static inline Vec8ui operator - (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui (Vec8i(a) - Vec8i(b));
|
|
}
|
|
|
|
// vector operator * : multiply
|
|
static inline Vec8ui operator * (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui (Vec8i(a) * Vec8i(b));
|
|
}
|
|
|
|
// vector operator / : divide all elements by same integer
|
|
static inline Vec8ui operator / (Vec8ui const & a, Divisor_ui const & d) {
|
|
return Vec8ui(a.get_low() / d, a.get_high() / d);
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
static inline Vec8ui & operator /= (Vec8ui & a, Divisor_ui const & d) {
|
|
a = a / d;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec8ui operator >> (Vec8ui const & a, uint32_t b) {
|
|
return Vec8ui(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec8ui operator >> (Vec8ui const & a, int32_t b) {
|
|
return a >> (uint32_t)b;
|
|
}
|
|
|
|
// vector operator >>= : shift right logical
|
|
static inline Vec8ui & operator >>= (Vec8ui & a, uint32_t b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >>= : shift right logical
|
|
static inline Vec8ui & operator >>= (Vec8ui & a, int32_t b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec8ui operator << (Vec8ui const & a, uint32_t b) {
|
|
return Vec8ui ((Vec8i)a << (int32_t)b);
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec8ui operator << (Vec8ui const & a, int32_t b) {
|
|
return Vec8ui ((Vec8i)a << (int32_t)b);
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b (unsigned)
|
|
static inline Vec8ib operator > (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8i(a.get_low() > b.get_low(), a.get_high() > b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b (unsigned)
|
|
static inline Vec8ib operator < (Vec8ui const & a, Vec8ui const & b) {
|
|
return b > a;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (unsigned)
|
|
static inline Vec8ib operator >= (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8i(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (unsigned)
|
|
static inline Vec8ib operator <= (Vec8ui const & a, Vec8ui const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec8ui operator & (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec8ui operator && (Vec8ui const & a, Vec8ui const & b) {
|
|
return a & b;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec8ui operator | (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec8ui operator || (Vec8ui const & a, Vec8ui const & b) {
|
|
return a | b;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec8ui operator ^ (Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec8ui operator ~ (Vec8ui const & a) {
|
|
return Vec8ui(~a.get_low(), ~a.get_high());
|
|
}
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
// (s is signed)
|
|
static inline Vec8ui select (Vec8ib const & s, Vec8ui const & a, Vec8ui const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec8ui if_add (Vec8ib const & f, Vec8ui const & a, Vec8ui const & b) {
|
|
return a + (Vec8ui(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
static inline uint32_t horizontal_add (Vec8ui const & a) {
|
|
return horizontal_add((Vec8i)a);
|
|
}
|
|
|
|
// Horizontal add extended: Calculates the sum of all vector elements.
|
|
// Elements are zero extended before adding to avoid overflow
|
|
static inline uint64_t horizontal_add_x (Vec8ui const & a) {
|
|
return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
|
|
}
|
|
|
|
// function add_saturated: add element by element, unsigned with saturation
|
|
static inline Vec8ui add_saturated(Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function sub_saturated: subtract element by element, unsigned with saturation
|
|
static inline Vec8ui sub_saturated(Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec8ui max(Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec8ui min(Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 4 64-bit signed integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec4q : public Vec256b {
|
|
public:
|
|
// Default constructor:
|
|
Vec4q() {
|
|
}
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec4q(int64_t i) {
|
|
y0 = y1 = Vec2q(i);
|
|
}
|
|
// Constructor to build from all elements:
|
|
Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
|
|
y0 = Vec2q(i0,i1);
|
|
y1 = Vec2q(i2,i3);
|
|
}
|
|
// Constructor to build from two Vec2q:
|
|
Vec4q(Vec2q const & a0, Vec2q const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec4q(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
}
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec4q & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
}
|
|
// Member function to load from array (unaligned)
|
|
Vec4q & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec4q & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Partial load. Load n elements and set the rest to 0
|
|
Vec4q & load_partial(int n, void const * p) {
|
|
if (n <= 0) {
|
|
*this = 0;
|
|
}
|
|
else if (n <= 2) {
|
|
*this = Vec4q(Vec2q().load_partial(n, p), 0);
|
|
}
|
|
else if (n < 4) {
|
|
*this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t*)p+2));
|
|
}
|
|
else {
|
|
load(p);
|
|
}
|
|
return *this;
|
|
}
|
|
// Partial store. Store n elements
|
|
void store_partial(int n, void * p) const {
|
|
if (n <= 0) {
|
|
return;
|
|
}
|
|
else if (n <= 2) {
|
|
get_low().store_partial(n, p);
|
|
}
|
|
else if (n < 4) {
|
|
get_low().store(p);
|
|
get_high().store_partial(n-2, (int64_t*)p+2);
|
|
}
|
|
else {
|
|
store(p);
|
|
}
|
|
}
|
|
// cut off vector to n elements. The last 8-n elements are set to zero
|
|
Vec4q & cutoff(int n) {
|
|
*this = Vec32c(*this).cutoff(n * 8);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec4q const & insert(uint32_t index, int64_t value) {
|
|
if (index < 2) {
|
|
y0 = Vec2q(y0).insert(index, value);
|
|
}
|
|
else {
|
|
y1 = Vec2q(y1).insert(index-2, value);
|
|
}
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
// Note: This function is inefficient. Use store function if extracting more than one element
|
|
int64_t extract(uint32_t index) const {
|
|
if (index < 2) {
|
|
return Vec2q(y0).extract(index);
|
|
}
|
|
else {
|
|
return Vec2q(y1).extract(index-2);
|
|
}
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
int64_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec2q:
|
|
Vec2q get_low() const {
|
|
return y0;
|
|
}
|
|
Vec2q get_high() const {
|
|
return y1;
|
|
}
|
|
static int size () {
|
|
return 4;
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vec4qb: Vector of 4 Booleans for use with Vec4q and Vec4uq
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec4qb : public Vec4q {
|
|
public:
|
|
// Default constructor:
|
|
Vec4qb() {
|
|
}
|
|
// Constructor to build from all elements:
|
|
Vec4qb(bool x0, bool x1, bool x2, bool x3) :
|
|
Vec4q(-int64_t(x0), -int64_t(x1), -int64_t(x2), -int64_t(x3)) {
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec4qb(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
}
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec4qb & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
}
|
|
// Constructor to broadcast scalar value:
|
|
Vec4qb(bool b) : Vec4q(-int64_t(b)) {
|
|
}
|
|
// Assignment operator to broadcast scalar value:
|
|
Vec4qb & operator = (bool b) {
|
|
*this = Vec4qb(b);
|
|
return *this;
|
|
}
|
|
private: // Prevent constructing from int, etc.
|
|
Vec4qb(int b);
|
|
Vec4qb & operator = (int x);
|
|
public:
|
|
// Member functions to split into two Vec2qb:
|
|
Vec2qb get_low() const {
|
|
return y0;
|
|
}
|
|
Vec2qb get_high() const {
|
|
return y1;
|
|
}
|
|
Vec4qb & insert (int index, bool a) {
|
|
Vec4q::insert(index, -(int64_t)a);
|
|
return *this;
|
|
};
|
|
// Member function extract a single element from vector
|
|
// Note: This function is inefficient. Use store function if extracting more than one element
|
|
bool extract(uint32_t index) const {
|
|
return Vec4q::extract(index) != 0;
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
bool operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
};
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Define operators for Vec4qb
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec4qb operator & (Vec4qb const & a, Vec4qb const & b) {
|
|
return Vec4qb(Vec256b(a) & Vec256b(b));
|
|
}
|
|
static inline Vec4qb operator && (Vec4qb const & a, Vec4qb const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec4qb & operator &= (Vec4qb & a, Vec4qb const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec4qb operator | (Vec4qb const & a, Vec4qb const & b) {
|
|
return Vec4qb(Vec256b(a) | Vec256b(b));
|
|
}
|
|
static inline Vec4qb operator || (Vec4qb const & a, Vec4qb const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec4qb & operator |= (Vec4qb & a, Vec4qb const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec4qb operator ^ (Vec4qb const & a, Vec4qb const & b) {
|
|
return Vec4qb(Vec256b(a) ^ Vec256b(b));
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec4qb & operator ^= (Vec4qb & a, Vec4qb const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec4qb operator ~ (Vec4qb const & a) {
|
|
return Vec4qb( ~ Vec256b(a));
|
|
}
|
|
|
|
// vector operator ! : element not
|
|
static inline Vec4qb operator ! (Vec4qb const & a) {
|
|
return ~ a;
|
|
}
|
|
|
|
// vector function andnot
|
|
static inline Vec4qb andnot (Vec4qb const & a, Vec4qb const & b) {
|
|
return Vec4qb(andnot(Vec256b(a), Vec256b(b)));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Operators for Vec4q
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// vector operator + : add element by element
|
|
static inline Vec4q operator + (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() + b.get_low(), a.get_high() + b.get_high());
|
|
}
|
|
|
|
// vector operator += : add
|
|
static inline Vec4q & operator += (Vec4q & a, Vec4q const & b) {
|
|
a = a + b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator ++
|
|
static inline Vec4q operator ++ (Vec4q & a, int) {
|
|
Vec4q a0 = a;
|
|
a = a + 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator ++
|
|
static inline Vec4q & operator ++ (Vec4q & a) {
|
|
a = a + 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator - : subtract element by element
|
|
static inline Vec4q operator - (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() - b.get_low(), a.get_high() - b.get_high());
|
|
}
|
|
|
|
// vector operator - : unary minus
|
|
static inline Vec4q operator - (Vec4q const & a) {
|
|
return Vec4q(-a.get_low(), -a.get_high());
|
|
}
|
|
|
|
// vector operator -= : subtract
|
|
static inline Vec4q & operator -= (Vec4q & a, Vec4q const & b) {
|
|
a = a - b;
|
|
return a;
|
|
}
|
|
|
|
// postfix operator --
|
|
static inline Vec4q operator -- (Vec4q & a, int) {
|
|
Vec4q a0 = a;
|
|
a = a - 1;
|
|
return a0;
|
|
}
|
|
|
|
// prefix operator --
|
|
static inline Vec4q & operator -- (Vec4q & a) {
|
|
a = a - 1;
|
|
return a;
|
|
}
|
|
|
|
// vector operator * : multiply element by element
|
|
static inline Vec4q operator * (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() * b.get_low(), a.get_high() * b.get_high());
|
|
}
|
|
|
|
// vector operator *= : multiply
|
|
static inline Vec4q & operator *= (Vec4q & a, Vec4q const & b) {
|
|
a = a * b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left
|
|
static inline Vec4q operator << (Vec4q const & a, int32_t b) {
|
|
return Vec4q(a.get_low() << b, a.get_high() << b);
|
|
}
|
|
|
|
// vector operator <<= : shift left
|
|
static inline Vec4q & operator <<= (Vec4q & a, int32_t b) {
|
|
a = a << b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator >> : shift right arithmetic
|
|
static inline Vec4q operator >> (Vec4q const & a, int32_t b) {
|
|
return Vec4q(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >>= : shift right arithmetic
|
|
static inline Vec4q & operator >>= (Vec4q & a, int32_t b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator == : returns true for elements for which a == b
|
|
static inline Vec4qb operator == (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() == b.get_low(), a.get_high() == b.get_high());
|
|
}
|
|
|
|
// vector operator != : returns true for elements for which a != b
|
|
static inline Vec4qb operator != (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() != b.get_low(), a.get_high() != b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b
|
|
static inline Vec4qb operator < (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() < b.get_low(), a.get_high() < b.get_high());
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b
|
|
static inline Vec4qb operator > (Vec4q const & a, Vec4q const & b) {
|
|
return b < a;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (signed)
|
|
static inline Vec4qb operator >= (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (signed)
|
|
static inline Vec4qb operator <= (Vec4q const & a, Vec4q const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec4q operator & (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec4q operator && (Vec4q const & a, Vec4q const & b) {
|
|
return a & b;
|
|
}
|
|
// vector operator &= : bitwise and
|
|
static inline Vec4q & operator &= (Vec4q & a, Vec4q const & b) {
|
|
a = a & b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec4q operator | (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec4q operator || (Vec4q const & a, Vec4q const & b) {
|
|
return a | b;
|
|
}
|
|
// vector operator |= : bitwise or
|
|
static inline Vec4q & operator |= (Vec4q & a, Vec4q const & b) {
|
|
a = a | b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec4q operator ^ (Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
// vector operator ^= : bitwise xor
|
|
static inline Vec4q & operator ^= (Vec4q & a, Vec4q const & b) {
|
|
a = a ^ b;
|
|
return a;
|
|
}
|
|
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec4q operator ~ (Vec4q const & a) {
|
|
return Vec4q(~a.get_low(), ~a.get_high());
|
|
}
|
|
|
|
// vector operator ! : logical not, returns true for elements == 0
|
|
static inline Vec4qb operator ! (Vec4q const & a) {
|
|
return Vec4q(!a.get_low(), !a.get_high());
|
|
}
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
// (s is signed)
|
|
static inline Vec4q select (Vec4qb const & s, Vec4q const & a, Vec4q const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec4q if_add (Vec4qb const & f, Vec4q const & a, Vec4q const & b) {
|
|
return a + (Vec4q(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
static inline int64_t horizontal_add (Vec4q const & a) {
|
|
return horizontal_add(a.get_low() + a.get_high());
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec4q max(Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec4q min(Vec4q const & a, Vec4q const & b) {
|
|
return Vec4q(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function abs: a >= 0 ? a : -a
|
|
static inline Vec4q abs(Vec4q const & a) {
|
|
return Vec4q(abs(a.get_low()), abs(a.get_high()));
|
|
}
|
|
|
|
// function abs_saturated: same as abs, saturate if overflow
|
|
static inline Vec4q abs_saturated(Vec4q const & a) {
|
|
return Vec4q(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
|
|
}
|
|
|
|
// function rotate_left all elements
|
|
// Use negative count to rotate right
|
|
static inline Vec4q rotate_left(Vec4q const & a, int b) {
|
|
return Vec4q(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector of 4 64-bit unsigned integers
|
|
*
|
|
*****************************************************************************/
|
|
|
|
class Vec4uq : public Vec4q {
|
|
public:
|
|
// Default constructor:
|
|
Vec4uq() {
|
|
};
|
|
// Constructor to broadcast the same value into all elements:
|
|
Vec4uq(uint64_t i) {
|
|
y1 = y0 = Vec2q(i);
|
|
};
|
|
// Constructor to build from all elements:
|
|
Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3) {
|
|
y0 = Vec2q(i0, i1);
|
|
y1 = Vec2q(i2, i3);
|
|
};
|
|
// Constructor to build from two Vec2uq:
|
|
Vec4uq(Vec2uq const & a0, Vec2uq const & a1) {
|
|
y0 = a0; y1 = a1;
|
|
}
|
|
// Constructor to convert from type Vec256ie
|
|
Vec4uq(Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
};
|
|
// Assignment operator to convert from type Vec256ie
|
|
Vec4uq & operator = (Vec256ie const & x) {
|
|
y0 = x.get_low(); y1 = x.get_high();
|
|
return *this;
|
|
};
|
|
// Member function to load from array (unaligned)
|
|
Vec4uq & load(void const * p) {
|
|
y0 = _mm_loadu_si128((__m128i const*)p);
|
|
y1 = _mm_loadu_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to load from array, aligned by 32
|
|
Vec4uq & load_a(void const * p) {
|
|
y0 = _mm_load_si128((__m128i const*)p);
|
|
y1 = _mm_load_si128((__m128i const*)p + 1);
|
|
return *this;
|
|
}
|
|
// Member function to change a single element in vector
|
|
// Note: This function is inefficient. Use load function if changing more than one element
|
|
Vec4uq const & insert(uint32_t index, uint64_t value) {
|
|
Vec4q::insert(index, value);
|
|
return *this;
|
|
}
|
|
// Member function extract a single element from vector
|
|
uint64_t extract(uint32_t index) const {
|
|
return Vec4q::extract(index);
|
|
}
|
|
// Extract a single element. Use store function if extracting more than one element.
|
|
// Operator [] can only read an element, not write.
|
|
uint64_t operator [] (uint32_t index) const {
|
|
return extract(index);
|
|
}
|
|
// Member functions to split into two Vec2uq:
|
|
Vec2uq get_low() const {
|
|
return y0;
|
|
}
|
|
Vec2uq get_high() const {
|
|
return y1;
|
|
}
|
|
};
|
|
|
|
// Define operators for this class
|
|
|
|
// vector operator + : add
|
|
static inline Vec4uq operator + (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq (Vec4q(a) + Vec4q(b));
|
|
}
|
|
|
|
// vector operator - : subtract
|
|
static inline Vec4uq operator - (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq (Vec4q(a) - Vec4q(b));
|
|
}
|
|
|
|
// vector operator * : multiply element by element
|
|
static inline Vec4uq operator * (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq (Vec4q(a) * Vec4q(b));
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec4uq operator >> (Vec4uq const & a, uint32_t b) {
|
|
return Vec4uq(a.get_low() >> b, a.get_high() >> b);
|
|
}
|
|
|
|
// vector operator >> : shift right logical all elements
|
|
static inline Vec4uq operator >> (Vec4uq const & a, int32_t b) {
|
|
return a >> (uint32_t)b;
|
|
}
|
|
|
|
// vector operator >>= : shift right artihmetic
|
|
static inline Vec4uq & operator >>= (Vec4uq & a, uint32_t b) {
|
|
a = a >> b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec4uq operator << (Vec4uq const & a, uint32_t b) {
|
|
return Vec4uq ((Vec4q)a << (int32_t)b);
|
|
}
|
|
|
|
// vector operator << : shift left all elements
|
|
static inline Vec4uq operator << (Vec4uq const & a, int32_t b) {
|
|
return Vec4uq ((Vec4q)a << b);
|
|
}
|
|
|
|
// vector operator > : returns true for elements for which a > b (unsigned)
|
|
static inline Vec4qb operator > (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4q(a.get_low() > b.get_low(), a.get_high() > b.get_high());
|
|
}
|
|
|
|
// vector operator < : returns true for elements for which a < b (unsigned)
|
|
static inline Vec4qb operator < (Vec4uq const & a, Vec4uq const & b) {
|
|
return b > a;
|
|
}
|
|
|
|
// vector operator >= : returns true for elements for which a >= b (unsigned)
|
|
static inline Vec4qb operator >= (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4q(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
|
|
}
|
|
|
|
// vector operator <= : returns true for elements for which a <= b (unsigned)
|
|
static inline Vec4qb operator <= (Vec4uq const & a, Vec4uq const & b) {
|
|
return b >= a;
|
|
}
|
|
|
|
// vector operator & : bitwise and
|
|
static inline Vec4uq operator & (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq(a.get_low() & b.get_low(), a.get_high() & b.get_high());
|
|
}
|
|
static inline Vec4uq operator && (Vec4uq const & a, Vec4uq const & b) {
|
|
return a & b;
|
|
}
|
|
|
|
// vector operator | : bitwise or
|
|
static inline Vec4uq operator | (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4q(a.get_low() | b.get_low(), a.get_high() | b.get_high());
|
|
}
|
|
static inline Vec4uq operator || (Vec4uq const & a, Vec4uq const & b) {
|
|
return a | b;
|
|
}
|
|
|
|
// vector operator ^ : bitwise xor
|
|
static inline Vec4uq operator ^ (Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
|
|
}
|
|
|
|
// vector operator ~ : bitwise not
|
|
static inline Vec4uq operator ~ (Vec4uq const & a) {
|
|
return Vec4uq(~a.get_low(), ~a.get_high());
|
|
}
|
|
|
|
// Functions for this class
|
|
|
|
// Select between two operands. Corresponds to this pseudocode:
|
|
// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
|
|
// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
|
|
// (s is signed)
|
|
static inline Vec4uq select (Vec4qb const & s, Vec4uq const & a, Vec4uq const & b) {
|
|
return selectb(s,a,b);
|
|
}
|
|
|
|
// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
|
|
static inline Vec4uq if_add (Vec4qb const & f, Vec4uq const & a, Vec4uq const & b) {
|
|
return a + (Vec4uq(f) & b);
|
|
}
|
|
|
|
// Horizontal add: Calculates the sum of all vector elements.
|
|
// Overflow will wrap around
|
|
static inline uint64_t horizontal_add (Vec4uq const & a) {
|
|
return horizontal_add((Vec4q)a);
|
|
}
|
|
|
|
// function max: a > b ? a : b
|
|
static inline Vec4uq max(Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
// function min: a < b ? a : b
|
|
static inline Vec4uq min(Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector permute functions
|
|
*
|
|
******************************************************************************
|
|
*
|
|
* These permute functions can reorder the elements of a vector and optionally
|
|
* set some elements to zero.
|
|
*
|
|
* The indexes are inserted as template parameters in <>. These indexes must be
|
|
* constants. Each template parameter is an index to the element you want to select.
|
|
* An index of -1 will generate zero. An index of -256 means don't care.
|
|
*
|
|
* Example:
|
|
|
|
* Vec8i a(10,11,12,13,14,15,16,17); // a is (10,11,12,13,14,15,16,17)
|
|
* Vec8i b;
|
|
* b = permute8i<0,2,7,7,-1,-1,1,1>(a); // b is (10,12,17,17, 0, 0,11,11)
|
|
*
|
|
* A lot of the code here is metaprogramming aiming to find the instructions
|
|
* that best fit the template parameters and instruction set. The metacode
|
|
* will be reduced out to leave only a few vector instructions in release
|
|
* mode with optimization on.
|
|
*****************************************************************************/
|
|
|
|
|
|
// Shuffle vector of 4 64-bit integers.
|
|
// Index -1 gives 0, index -256 means don't care.
|
|
template <int i0, int i1, int i2, int i3 >
|
|
static inline Vec4q permute4q(Vec4q const & a) {
|
|
return Vec4q(blend2q<i0,i1> (a.get_low(), a.get_high()),
|
|
blend2q<i2,i3> (a.get_low(), a.get_high()));
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3>
|
|
static inline Vec4uq permute4uq(Vec4uq const & a) {
|
|
return Vec4uq (permute4q<i0,i1,i2,i3> (a));
|
|
}
|
|
|
|
// Shuffle vector of 8 32-bit integers.
|
|
// Index -1 gives 0, index -256 means don't care.
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7 >
|
|
static inline Vec8i permute8i(Vec8i const & a) {
|
|
return Vec8i(blend4i<i0,i1,i2,i3> (a.get_low(), a.get_high()),
|
|
blend4i<i4,i5,i6,i7> (a.get_low(), a.get_high()));
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7 >
|
|
static inline Vec8ui permute8ui(Vec8ui const & a) {
|
|
return Vec8ui (permute8i<i0,i1,i2,i3,i4,i5,i6,i7> (a));
|
|
}
|
|
|
|
// Shuffle vector of 16 16-bit integers.
|
|
// Index -1 gives 0, index -256 means don't care.
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
|
|
static inline Vec16s permute16s(Vec16s const & a) {
|
|
return Vec16s(blend8s<i0,i1,i2 ,i3 ,i4 ,i5 ,i6 ,i7 > (a.get_low(), a.get_high()),
|
|
blend8s<i8,i9,i10,i11,i12,i13,i14,i15> (a.get_low(), a.get_high()));
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
|
|
static inline Vec16us permute16us(Vec16us const & a) {
|
|
return Vec16us (permute16s<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a));
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
|
|
int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
|
|
int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
|
|
static inline Vec32c permute32c(Vec32c const & a) {
|
|
return Vec32c(blend16c<i0, i1, i2 ,i3 ,i4 ,i5 ,i6 ,i7, i8, i9, i10,i11,i12,i13,i14,i15> (a.get_low(), a.get_high()),
|
|
blend16c<i16,i17,i18,i19,i20,i21,i22,i23,i24,i25,i26,i27,i28,i29,i30,i31> (a.get_low(), a.get_high()));
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
|
|
int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
|
|
int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
|
|
static inline Vec32uc permute32uc(Vec32uc const & a) {
|
|
return Vec32uc (permute32c<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15,
|
|
i16,i17,i18,i19,i20,i21,i22,i23,i24,i25,i26,i27,i28,i29,i30,i31> (a));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector blend functions
|
|
*
|
|
******************************************************************************
|
|
*
|
|
* These blend functions can mix elements from two different vectors and
|
|
* optionally set some elements to zero.
|
|
*
|
|
* The indexes are inserted as template parameters in <>. These indexes must be
|
|
* constants. Each template parameter is an index to the element you want to
|
|
* select, where higher indexes indicate an element from the second source
|
|
* vector. For example, if each vector has 8 elements, then indexes 0 - 7
|
|
* will select an element from the first vector and indexes 8 - 15 will select
|
|
* an element from the second vector. A negative index will generate zero.
|
|
*
|
|
* Example:
|
|
* Vec8i a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
|
|
* Vec8i b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
|
|
* Vec8i c;
|
|
* c = blend8i<1,0,9,8,7,-1,15,15> (a,b); // c is (101, 100, 201, 200, 107, 0, 207, 207)
|
|
*
|
|
* A lot of the code here is metaprogramming aiming to find the instructions
|
|
* that best fit the template parameters and instruction set. The metacode
|
|
* will be reduced out to leave only a few vector instructions in release
|
|
* mode with optimization on.
|
|
*****************************************************************************/
|
|
|
|
// helper function used below
|
|
template <int n>
|
|
static inline Vec2q select4(Vec4q const & a, Vec4q const & b) {
|
|
switch (n) {
|
|
case 0:
|
|
return a.get_low();
|
|
case 1:
|
|
return a.get_high();
|
|
case 2:
|
|
return b.get_low();
|
|
case 3:
|
|
return b.get_high();
|
|
}
|
|
return _mm_setzero_si128();
|
|
}
|
|
|
|
// blend vectors Vec4q
|
|
template <int i0, int i1, int i2, int i3>
|
|
static inline Vec4q blend4q(Vec4q const & a, Vec4q const & b) {
|
|
const int j0 = i0 >= 0 ? i0/2 : i0;
|
|
const int j1 = i1 >= 0 ? i1/2 : i1;
|
|
const int j2 = i2 >= 0 ? i2/2 : i2;
|
|
const int j3 = i3 >= 0 ? i3/2 : i3;
|
|
Vec2q x0, x1;
|
|
|
|
if (j0 == j1 || i0 < 0 || i1 < 0) { // both from same
|
|
const int k0 = j0 >= 0 ? j0 : j1;
|
|
x0 = permute2q<i0 & -7, i1 & -7> (select4<k0> (a,b));
|
|
}
|
|
else {
|
|
x0 = blend2q<i0 & -7, (i1 & -7) | 2> (select4<j0>(a,b), select4<j1>(a,b));
|
|
}
|
|
if (j2 == j3 || i2 < 0 || i3 < 0) { // both from same
|
|
const int k1 = j2 >= 0 ? j2 : j3;
|
|
x1 = permute2q<i2 & -7, i3 & -7> (select4<k1> (a,b));
|
|
}
|
|
else {
|
|
x1 = blend2q<i2 & -7, (i3 & -7) | 2> (select4<j2>(a,b), select4<j3>(a,b));
|
|
}
|
|
return Vec4q(x0,x1);
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3>
|
|
static inline Vec4uq blend4uq(Vec4uq const & a, Vec4uq const & b) {
|
|
return Vec4uq( blend4q<i0,i1,i2,i3> (a,b));
|
|
}
|
|
|
|
// helper function used below
|
|
template <int n>
|
|
static inline Vec4i select4(Vec8i const & a, Vec8i const & b) {
|
|
switch (n) {
|
|
case 0:
|
|
return a.get_low();
|
|
case 1:
|
|
return a.get_high();
|
|
case 2:
|
|
return b.get_low();
|
|
case 3:
|
|
return b.get_high();
|
|
}
|
|
return _mm_setzero_si128();
|
|
}
|
|
|
|
// blend vectors Vec8i
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
|
|
static inline Vec8i blend8i(Vec8i const & a, Vec8i const & b) {
|
|
const int j0 = i0 >= 0 ? i0/4 : i0;
|
|
const int j1 = i1 >= 0 ? i1/4 : i1;
|
|
const int j2 = i2 >= 0 ? i2/4 : i2;
|
|
const int j3 = i3 >= 0 ? i3/4 : i3;
|
|
const int j4 = i4 >= 0 ? i4/4 : i4;
|
|
const int j5 = i5 >= 0 ? i5/4 : i5;
|
|
const int j6 = i6 >= 0 ? i6/4 : i6;
|
|
const int j7 = i7 >= 0 ? i7/4 : i7;
|
|
Vec4i x0, x1;
|
|
|
|
const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
|
|
const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
|
|
const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
|
|
const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
|
|
|
|
// Combine all the indexes into a single bitfield, with 4 bits for each
|
|
const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
|
|
|
|
// Mask to zero out negative indexes
|
|
const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
|
|
|
|
if (r0 < 0) {
|
|
x0 = _mm_setzero_si128();
|
|
}
|
|
else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) {
|
|
// i0 - i3 all from same source
|
|
x0 = permute4i<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
|
|
}
|
|
else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) {
|
|
// i0 - i3 all from two sources
|
|
const int k0 = i0 >= 0 ? i0 & 3 : i0;
|
|
const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
|
|
const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
|
|
const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
|
|
x0 = blend4i<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
|
|
}
|
|
else {
|
|
// i0 - i3 from three or four different sources
|
|
x0 = blend4i<0,1,6,7> (
|
|
blend4i<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
|
|
blend4i<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
|
|
}
|
|
|
|
if (r1 < 0) {
|
|
x1 = _mm_setzero_si128();
|
|
}
|
|
else if (((m1 ^ uint32_t(r1)*0x44440000u) & 0xCCCC0000 & mz) == 0) {
|
|
// i4 - i7 all from same source
|
|
x1 = permute4i<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
|
|
}
|
|
else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) {
|
|
// i4 - i7 all from two sources
|
|
const int k4 = i4 >= 0 ? i4 & 3 : i4;
|
|
const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
|
|
const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
|
|
const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
|
|
x1 = blend4i<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
|
|
}
|
|
else {
|
|
// i4 - i7 from three or four different sources
|
|
x1 = blend4i<0,1,6,7> (
|
|
blend4i<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
|
|
blend4i<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
|
|
}
|
|
|
|
return Vec8i(x0,x1);
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
|
|
static inline Vec8ui blend8ui(Vec8ui const & a, Vec8ui const & b) {
|
|
return Vec8ui( blend8i<i0,i1,i2,i3,i4,i5,i6,i7> (a,b));
|
|
}
|
|
|
|
// helper function used below
|
|
template <int n>
|
|
static inline Vec8s select4(Vec16s const & a, Vec16s const & b) {
|
|
switch (n) {
|
|
case 0:
|
|
return a.get_low();
|
|
case 1:
|
|
return a.get_high();
|
|
case 2:
|
|
return b.get_low();
|
|
case 3:
|
|
return b.get_high();
|
|
}
|
|
return _mm_setzero_si128();
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
|
|
static inline Vec16s blend16s(Vec16s const & a, Vec16s const & b) {
|
|
|
|
const int j0 = i0 >= 0 ? i0 /8 : i0;
|
|
const int j1 = i1 >= 0 ? i1 /8 : i1;
|
|
const int j2 = i2 >= 0 ? i2 /8 : i2;
|
|
const int j3 = i3 >= 0 ? i3 /8 : i3;
|
|
const int j4 = i4 >= 0 ? i4 /8 : i4;
|
|
const int j5 = i5 >= 0 ? i5 /8 : i5;
|
|
const int j6 = i6 >= 0 ? i6 /8 : i6;
|
|
const int j7 = i7 >= 0 ? i7 /8 : i7;
|
|
const int j8 = i8 >= 0 ? i8 /8 : i8;
|
|
const int j9 = i9 >= 0 ? i9 /8 : i9;
|
|
const int j10 = i10 >= 0 ? i10/8 : i10;
|
|
const int j11 = i11 >= 0 ? i11/8 : i11;
|
|
const int j12 = i12 >= 0 ? i12/8 : i12;
|
|
const int j13 = i13 >= 0 ? i13/8 : i13;
|
|
const int j14 = i14 >= 0 ? i14/8 : i14;
|
|
const int j15 = i15 >= 0 ? i15/8 : i15;
|
|
|
|
Vec8s x0, x1;
|
|
|
|
const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
|
|
const int r1 = j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
|
|
const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : (j3 >= 0 && j3 != r0) ? j3 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
|
|
const int s1 = (j9 >= 0 && j9 != r1) ? j9 : (j10>= 0 && j10!= r1) ? j10 : (j11>= 0 && j11!= r1) ? j11: (j12>= 0 && j12!= r1) ? j12: (j13>= 0 && j13!= r1) ? j13: (j14>= 0 && j14!= r1) ? j14: j15;
|
|
|
|
if (r0 < 0) {
|
|
x0 = _mm_setzero_si128();
|
|
}
|
|
else if (r0 == s0) {
|
|
// i0 - i7 all from same source
|
|
x0 = permute8s<i0&-25, i1&-25, i2&-25, i3&-25, i4&-25, i5&-25, i6&-25, i7&-25> (select4<r0> (a,b));
|
|
}
|
|
else if ((j2<0||j2==r0||j2==s0) && (j3<0||j3==r0||j3 == s0) && (j4<0||j4==r0||j4 == s0) && (j5<0||j5==r0||j5 == s0) && (j6<0||j6==r0||j6 == s0) && (j7<0||j7==r0||j7 == s0)) {
|
|
// i0 - i7 all from two sources
|
|
const int k0 = i0 >= 0 ? i0 & 7 : i0;
|
|
const int k1 = (i1 >= 0 ? i1 & 7 : i1) | (j1 == s0 ? 8 : 0);
|
|
const int k2 = (i2 >= 0 ? i2 & 7 : i2) | (j2 == s0 ? 8 : 0);
|
|
const int k3 = (i3 >= 0 ? i3 & 7 : i3) | (j3 == s0 ? 8 : 0);
|
|
const int k4 = (i4 >= 0 ? i4 & 7 : i4) | (j4 == s0 ? 8 : 0);
|
|
const int k5 = (i5 >= 0 ? i5 & 7 : i5) | (j5 == s0 ? 8 : 0);
|
|
const int k6 = (i6 >= 0 ? i6 & 7 : i6) | (j6 == s0 ? 8 : 0);
|
|
const int k7 = (i7 >= 0 ? i7 & 7 : i7) | (j7 == s0 ? 8 : 0);
|
|
x0 = blend8s<k0,k1,k2,k3,k4,k5,k6,k7> (select4<r0>(a,b), select4<s0>(a,b));
|
|
}
|
|
else {
|
|
// i0 - i7 from three or four different sources
|
|
const int n0 = j0 >= 0 ? j0 /2*8 + 0 : j0;
|
|
const int n1 = j1 >= 0 ? j1 /2*8 + 1 : j1;
|
|
const int n2 = j2 >= 0 ? j2 /2*8 + 2 : j2;
|
|
const int n3 = j3 >= 0 ? j3 /2*8 + 3 : j3;
|
|
const int n4 = j4 >= 0 ? j4 /2*8 + 4 : j4;
|
|
const int n5 = j5 >= 0 ? j5 /2*8 + 5 : j5;
|
|
const int n6 = j6 >= 0 ? j6 /2*8 + 6 : j6;
|
|
const int n7 = j7 >= 0 ? j7 /2*8 + 7 : j7;
|
|
x0 = blend8s<n0, n1, n2, n3, n4, n5, n6, n7> (
|
|
blend8s< j0 & 2 ? -256 : i0 &15, j1 & 2 ? -256 : i1 &15, j2 & 2 ? -256 : i2 &15, j3 & 2 ? -256 : i3 &15, j4 & 2 ? -256 : i4 &15, j5 & 2 ? -256 : i5 &15, j6 & 2 ? -256 : i6 &15, j7 & 2 ? -256 : i7 &15> (a.get_low(),a.get_high()),
|
|
blend8s<(j0^2)& 6 ? -256 : i0 &15, (j1^2)& 6 ? -256 : i1 &15, (j2^2)& 6 ? -256 : i2 &15, (j3^2)& 6 ? -256 : i3 &15, (j4^2)& 6 ? -256 : i4 &15, (j5^2)& 6 ? -256 : i5 &15, (j6^2)& 6 ? -256 : i6 &15, (j7^2)& 6 ? -256 : i7 &15> (b.get_low(),b.get_high()));
|
|
}
|
|
|
|
if (r1 < 0) {
|
|
x1 = _mm_setzero_si128();
|
|
}
|
|
else if (r1 == s1) {
|
|
// i8 - i15 all from same source
|
|
x1 = permute8s<i8&-25, i9&-25, i10&-25, i11&-25, i12&-25, i13&-25, i14&-25, i15&-25> (select4<r1> (a,b));
|
|
}
|
|
else if ((j10<0||j10==r1||j10==s1) && (j11<0||j11==r1||j11==s1) && (j12<0||j12==r1||j12==s1) && (j13<0||j13==r1||j13==s1) && (j14<0||j14==r1||j14==s1) && (j15<0||j15==r1||j15==s1)) {
|
|
// i8 - i15 all from two sources
|
|
const int k8 = i8 >= 0 ? i8 & 7 : i8;
|
|
const int k9 = (i9 >= 0 ? i9 & 7 : i9 ) | (j9 == s1 ? 8 : 0);
|
|
const int k10= (i10>= 0 ? i10& 7 : i10) | (j10== s1 ? 8 : 0);
|
|
const int k11= (i11>= 0 ? i11& 7 : i11) | (j11== s1 ? 8 : 0);
|
|
const int k12= (i12>= 0 ? i12& 7 : i12) | (j12== s1 ? 8 : 0);
|
|
const int k13= (i13>= 0 ? i13& 7 : i13) | (j13== s1 ? 8 : 0);
|
|
const int k14= (i14>= 0 ? i14& 7 : i14) | (j14== s1 ? 8 : 0);
|
|
const int k15= (i15>= 0 ? i15& 7 : i15) | (j15== s1 ? 8 : 0);
|
|
x1 = blend8s<k8,k9,k10,k11,k12,k13,k14,k15> (select4<r1>(a,b), select4<s1>(a,b));
|
|
}
|
|
else {
|
|
// i8 - i15 from three or four different sources
|
|
const int n8 = j8 >= 0 ? j8 /2*8 + 0 : j8 ;
|
|
const int n9 = j9 >= 0 ? j9 /2*8 + 1 : j9 ;
|
|
const int n10= j10>= 0 ? j10/2*8 + 2 : j10;
|
|
const int n11= j11>= 0 ? j11/2*8 + 3 : j11;
|
|
const int n12= j12>= 0 ? j12/2*8 + 4 : j12;
|
|
const int n13= j13>= 0 ? j13/2*8 + 5 : j13;
|
|
const int n14= j14>= 0 ? j14/2*8 + 6 : j14;
|
|
const int n15= j15>= 0 ? j15/2*8 + 7 : j15;
|
|
x1 = blend8s<n8, n9, n10, n11, n12, n13, n14, n15> (
|
|
blend8s< j8 & 2 ? -256 : i8 &15, j9 & 2 ? -256 : i9 &15, j10 & 2 ? -256 : i10 &15, j11 & 2 ? -256 : i11 &15, j12 & 2 ? -256 : i12 &15, j13 & 2 ? -256 : i13 &15, j14 & 2 ? -256 : i14 &15, j15 & 2 ? -256 : i15 &15> (a.get_low(),a.get_high()),
|
|
blend8s<(j8^2)& 6 ? -256 : i8 &15, (j9^2)& 6 ? -256 : i9 &15, (j10^2)& 6 ? -256 : i10 &15, (j11^2)& 6 ? -256 : i11 &15, (j12^2)& 6 ? -256 : i12 &15, (j13^2)& 6 ? -256 : i13 &15, (j14^2)& 6 ? -256 : i14 &15, (j15^2)& 6 ? -256 : i15 &15> (b.get_low(),b.get_high()));
|
|
}
|
|
return Vec16s(x0,x1);
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
|
|
static inline Vec16us blend16us(Vec16us const & a, Vec16us const & b) {
|
|
return Vec16us( blend16s<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a,b));
|
|
}
|
|
|
|
// helper function used below
|
|
template <int n>
|
|
static inline Vec16c select4(Vec32c const & a, Vec32c const & b) {
|
|
switch (n) {
|
|
case 0:
|
|
return a.get_low();
|
|
case 1:
|
|
return a.get_high();
|
|
case 2:
|
|
return b.get_low();
|
|
case 3:
|
|
return b.get_high();
|
|
}
|
|
return _mm_setzero_si128();
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
|
|
int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
|
|
int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
|
|
static inline Vec32c blend32c(Vec32c const & a, Vec32c const & b) {
|
|
|
|
// j0 - j31 indicate one of four 16-byte sources
|
|
const int j0 = i0 >= 0 ? i0 /16 : i0;
|
|
const int j1 = i1 >= 0 ? i1 /16 : i1;
|
|
const int j2 = i2 >= 0 ? i2 /16 : i2;
|
|
const int j3 = i3 >= 0 ? i3 /16 : i3;
|
|
const int j4 = i4 >= 0 ? i4 /16 : i4;
|
|
const int j5 = i5 >= 0 ? i5 /16 : i5;
|
|
const int j6 = i6 >= 0 ? i6 /16 : i6;
|
|
const int j7 = i7 >= 0 ? i7 /16 : i7;
|
|
const int j8 = i8 >= 0 ? i8 /16 : i8;
|
|
const int j9 = i9 >= 0 ? i9 /16 : i9;
|
|
const int j10 = i10 >= 0 ? i10/16 : i10;
|
|
const int j11 = i11 >= 0 ? i11/16 : i11;
|
|
const int j12 = i12 >= 0 ? i12/16 : i12;
|
|
const int j13 = i13 >= 0 ? i13/16 : i13;
|
|
const int j14 = i14 >= 0 ? i14/16 : i14;
|
|
const int j15 = i15 >= 0 ? i15/16 : i15;
|
|
const int j16 = i16 >= 0 ? i16/16 : i16;
|
|
const int j17 = i17 >= 0 ? i17/16 : i17;
|
|
const int j18 = i18 >= 0 ? i18/16 : i18;
|
|
const int j19 = i19 >= 0 ? i19/16 : i19;
|
|
const int j20 = i20 >= 0 ? i20/16 : i20;
|
|
const int j21 = i21 >= 0 ? i21/16 : i21;
|
|
const int j22 = i22 >= 0 ? i22/16 : i22;
|
|
const int j23 = i23 >= 0 ? i23/16 : i23;
|
|
const int j24 = i24 >= 0 ? i24/16 : i24;
|
|
const int j25 = i25 >= 0 ? i25/16 : i25;
|
|
const int j26 = i26 >= 0 ? i26/16 : i26;
|
|
const int j27 = i27 >= 0 ? i27/16 : i27;
|
|
const int j28 = i28 >= 0 ? i28/16 : i28;
|
|
const int j29 = i29 >= 0 ? i29/16 : i29;
|
|
const int j30 = i30 >= 0 ? i30/16 : i30;
|
|
const int j31 = i31 >= 0 ? i31/16 : i31;
|
|
|
|
Vec16c x0, x1;
|
|
|
|
// r0, s0 = first two sources of low destination (i0 - i15)
|
|
// r1, s1 = first two sources of high destination (i16 - i31)
|
|
const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7 >= 0 ? j7 :
|
|
j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
|
|
const int r1 = j16>= 0 ? j16: j17>= 0 ? j17: j18 >= 0 ? j18 : j19 >= 0 ? j19 : j20 >= 0 ? j20 : j21 >= 0 ? j21 : j22 >= 0 ? j22 : j23>= 0 ? j23:
|
|
j24>= 0 ? j24: j25>= 0 ? j25: j26 >= 0 ? j26 : j27 >= 0 ? j27 : j28 >= 0 ? j28 : j29 >= 0 ? j29 : j30 >= 0 ? j30 : j31;
|
|
const int s0 = (j1 >=0&&j1 !=r0)?j1 : (j2 >=0&&j2 !=r0)?j2 : (j3 >=0&&j3 !=r0)?j3 : (j4 >=0&&j4 !=r0)?j4 : (j5 >=0&&j5 !=r0)?j5 : (j6 >=0&&j6 !=r0)?j6 : (j7 >=0&&j7 !=r0)?j7 :
|
|
(j8 >=0&&j8 !=r0)?j8 : (j9 >=0&&j9 !=r0)?j9 : (j10>=0&&j10!=r0)?j10 : (j11>=0&&j11!=r0)?j11 : (j12>=0&&j12!=r0)?j12 : (j13>=0&&j13!=r0)?j13: (j14>=0&&j14!=r0)?j14: j15;
|
|
const int s1 = (j17>=0&&j17!=r1)?j17 : (j18>=0&&j18!=r1)?j18 : (j19>=0&&j19!=r1)?j19 : (j20>=0&&j20!=r1)?j20 : (j21>=0&&j21!=r1)?j21 : (j22>=0&&j22!=r1)?j22: (j23>=0&&j23!=r1)?j23:
|
|
(j24>=0&&j24!=r1)?j24 : (j25>=0&&j25!=r1)?j25 : (j26>=0&&j26!=r1)?j26 : (j27>=0&&j27!=r1)?j27 : (j28>=0&&j28!=r1)?j28 : (j29>=0&&j29!=r1)?j29: (j30>=0&&j30!=r1)?j30: j31;
|
|
|
|
if (r0 < 0) {
|
|
x0 = _mm_setzero_si128();
|
|
}
|
|
else if (r0 == s0) {
|
|
// i0 - i15 all from same source
|
|
x0 = permute16c< i0&-49, i1&-49, i2 &-49, i3 &-49, i4 &-49, i5 &-49, i6 &-49, i7 &-49,
|
|
i8&-49, i9&-49, i10&-49, i11&-49, i12&-49, i13&-49, i14&-49, i15&-49 >
|
|
(select4<r0> (a,b));
|
|
}
|
|
else if ((j2 <0||j2 ==r0||j2 ==s0) && (j3 <0||j3 ==r0||j3 ==s0) && (j4 <0||j4 ==r0||j4 ==s0) && (j5 <0||j5 ==r0||j5 ==s0) && (j6 <0||j6 ==r0||j6 ==s0) && (j7 <0||j7 ==r0||j7 ==s0) && (j8 <0||j8 ==r0||j8 ==s0) &&
|
|
(j9 <0||j9 ==r0||j9 ==s0) && (j10<0||j10==r0||j10==s0) && (j11<0||j11==r0||j11==s0) && (j12<0||j12==r0||j12==s0) && (j13<0||j13==r0||j13==s0) && (j14<0||j14==r0||j14==s0) && (j15<0||j15==r0||j15==s0)) {
|
|
// i0 - i15 all from two sources
|
|
const int k0 = i0 >= 0 ? i0 & 15 : i0;
|
|
const int k1 = (i1 >= 0 ? i1 & 15 : i1 ) | (j1 == s0 ? 16 : 0);
|
|
const int k2 = (i2 >= 0 ? i2 & 15 : i2 ) | (j2 == s0 ? 16 : 0);
|
|
const int k3 = (i3 >= 0 ? i3 & 15 : i3 ) | (j3 == s0 ? 16 : 0);
|
|
const int k4 = (i4 >= 0 ? i4 & 15 : i4 ) | (j4 == s0 ? 16 : 0);
|
|
const int k5 = (i5 >= 0 ? i5 & 15 : i5 ) | (j5 == s0 ? 16 : 0);
|
|
const int k6 = (i6 >= 0 ? i6 & 15 : i6 ) | (j6 == s0 ? 16 : 0);
|
|
const int k7 = (i7 >= 0 ? i7 & 15 : i7 ) | (j7 == s0 ? 16 : 0);
|
|
const int k8 = (i8 >= 0 ? i8 & 15 : i8 ) | (j8 == s0 ? 16 : 0);
|
|
const int k9 = (i9 >= 0 ? i9 & 15 : i9 ) | (j9 == s0 ? 16 : 0);
|
|
const int k10= (i10>= 0 ? i10& 15 : i10) | (j10== s0 ? 16 : 0);
|
|
const int k11= (i11>= 0 ? i11& 15 : i11) | (j11== s0 ? 16 : 0);
|
|
const int k12= (i12>= 0 ? i12& 15 : i12) | (j12== s0 ? 16 : 0);
|
|
const int k13= (i13>= 0 ? i13& 15 : i13) | (j13== s0 ? 16 : 0);
|
|
const int k14= (i14>= 0 ? i14& 15 : i14) | (j14== s0 ? 16 : 0);
|
|
const int k15= (i15>= 0 ? i15& 15 : i15) | (j15== s0 ? 16 : 0);
|
|
x0 = blend16c<k0,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15> (select4<r0>(a,b), select4<s0>(a,b));
|
|
}
|
|
else {
|
|
// i0 - i15 from three or four different sources
|
|
const int n0 = j0 >= 0 ? j0 /2*16 + 0 : j0;
|
|
const int n1 = j1 >= 0 ? j1 /2*16 + 1 : j1;
|
|
const int n2 = j2 >= 0 ? j2 /2*16 + 2 : j2;
|
|
const int n3 = j3 >= 0 ? j3 /2*16 + 3 : j3;
|
|
const int n4 = j4 >= 0 ? j4 /2*16 + 4 : j4;
|
|
const int n5 = j5 >= 0 ? j5 /2*16 + 5 : j5;
|
|
const int n6 = j6 >= 0 ? j6 /2*16 + 6 : j6;
|
|
const int n7 = j7 >= 0 ? j7 /2*16 + 7 : j7;
|
|
const int n8 = j8 >= 0 ? j8 /2*16 + 8 : j8;
|
|
const int n9 = j9 >= 0 ? j9 /2*16 + 9 : j9;
|
|
const int n10= j10>= 0 ? j10/2*16 +10 : j10;
|
|
const int n11= j11>= 0 ? j11/2*16 +11 : j11;
|
|
const int n12= j12>= 0 ? j12/2*16 +12 : j12;
|
|
const int n13= j13>= 0 ? j13/2*16 +13 : j13;
|
|
const int n14= j14>= 0 ? j14/2*16 +14 : j14;
|
|
const int n15= j15>= 0 ? j15/2*16 +15 : j15;
|
|
|
|
Vec16c x0a = blend16c< j0 & 2 ? -256 : i0 & 31, j1 & 2 ? -256 : i1 & 31, j2 & 2 ? -256 : i2 & 31, j3 & 2 ? -256 : i3 & 31, j4 & 2 ? -256 : i4 & 31, j5 & 2 ? -256 : i5 & 31, j6 & 2 ? -256 : i6 & 31, j7 & 2 ? -256 : i7 & 31,
|
|
j8 & 2 ? -256 : i8 & 31, j9 & 2 ? -256 : i9 & 31, j10 & 2 ? -256 : i10& 31, j11 & 2 ? -256 : i11& 31, j12 & 2 ? -256 : i12& 31, j13 & 2 ? -256 : i13& 31, j14 & 2 ? -256 : i14& 31, j15 & 2 ? -256 : i15& 31 > (a.get_low(),a.get_high());
|
|
Vec16c x0b = blend16c<(j0^2)& 6 ? -256 : i0 & 31, (j1^2)& 6 ? -256 : i1 & 31, (j2 ^2)& 6 ? -256 : i2 & 31, (j3 ^2)& 6 ? -256 : i3 & 31, (j4 ^2)& 6 ? -256 : i4 & 31, (j5 ^2)& 6 ? -256 : i5 & 31, (j6 ^2)& 6 ? -256 : i6 & 31, (j7 ^2)& 6 ? -256 : i7 & 31,
|
|
(j8^2)& 6 ? -256 : i8 & 31, (j9^2)& 6 ? -256 : i9 & 31, (j10^2)& 6 ? -256 : i10& 31, (j11^2)& 6 ? -256 : i11& 31, (j12^2)& 6 ? -256 : i12& 31, (j13^2)& 6 ? -256 : i13& 31, (j14^2)& 6 ? -256 : i14& 31, (j15^2)& 6 ? -256 : i15& 31 > (b.get_low(),b.get_high());
|
|
x0 = blend16c<n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15> (x0a, x0b);
|
|
}
|
|
|
|
if (r1 < 0) {
|
|
x1 = _mm_setzero_si128();
|
|
}
|
|
else if (r1 == s1) {
|
|
// i16 - i31 all from same source
|
|
x1 = permute16c< i16&-49, i17&-49, i18&-49, i19&-49, i20&-49, i21&-49, i22&-49, i23&-49,
|
|
i24&-49, i25&-49, i26&-49, i27&-49, i28&-49, i29&-49, i30&-49, i31&-49 >
|
|
(select4<r1> (a,b));
|
|
}
|
|
else if ((j18<0||j18==r1||j18==s1) && (j19<0||j19==r1||j19==s1) && (j20<0||j20==r1||j20==s1) && (j21<0||j21==r1||j21==s1) && (j22<0||j22==r1||j22==s1) && (j23<0||j23==r1||j23==s1) && (j24<0||j24==r1||j24==s1) &&
|
|
(j25<0||j25==r1||j25==s1) && (j26<0||j26==r1||j26==s1) && (j27<0||j27==r1||j27==s1) && (j28<0||j28==r1||j28==s1) && (j29<0||j29==r1||j29==s1) && (j30<0||j30==r1||j30==s1) && (j31<0||j31==r1||j31==s1)) {
|
|
// i16 - i31 all from two sources
|
|
const int k16= i16>= 0 ? i16& 15 : i16;
|
|
const int k17= (i17>= 0 ? i17& 15 : i17) | (j17== s1 ? 16 : 0);
|
|
const int k18= (i18>= 0 ? i18& 15 : i18) | (j18== s1 ? 16 : 0);
|
|
const int k19= (i19>= 0 ? i19& 15 : i19) | (j19== s1 ? 16 : 0);
|
|
const int k20= (i20>= 0 ? i20& 15 : i20) | (j20== s1 ? 16 : 0);
|
|
const int k21= (i21>= 0 ? i21& 15 : i21) | (j21== s1 ? 16 : 0);
|
|
const int k22= (i22>= 0 ? i22& 15 : i22) | (j22== s1 ? 16 : 0);
|
|
const int k23= (i23>= 0 ? i23& 15 : i23) | (j23== s1 ? 16 : 0);
|
|
const int k24= (i24>= 0 ? i24& 15 : i24) | (j24== s1 ? 16 : 0);
|
|
const int k25= (i25>= 0 ? i25& 15 : i25) | (j25== s1 ? 16 : 0);
|
|
const int k26= (i26>= 0 ? i26& 15 : i26) | (j26== s1 ? 16 : 0);
|
|
const int k27= (i27>= 0 ? i27& 15 : i27) | (j27== s1 ? 16 : 0);
|
|
const int k28= (i28>= 0 ? i28& 15 : i28) | (j28== s1 ? 16 : 0);
|
|
const int k29= (i29>= 0 ? i29& 15 : i29) | (j29== s1 ? 16 : 0);
|
|
const int k30= (i30>= 0 ? i30& 15 : i30) | (j30== s1 ? 16 : 0);
|
|
const int k31= (i31>= 0 ? i31& 15 : i31) | (j31== s1 ? 16 : 0);
|
|
x1 = blend16c<k16,k17,k18,k19,k20,k21,k22,k23,k24,k25,k26,k27,k28,k29,k30,k31> (select4<r1>(a,b), select4<s1>(a,b));
|
|
}
|
|
else {
|
|
// i16 - i31 from three or four different sources
|
|
const int n16= j16>= 0 ? j16/2*16 + 0 : j16;
|
|
const int n17= j17>= 0 ? j17/2*16 + 1 : j17;
|
|
const int n18= j18>= 0 ? j18/2*16 + 2 : j18;
|
|
const int n19= j19>= 0 ? j19/2*16 + 3 : j19;
|
|
const int n20= j20>= 0 ? j20/2*16 + 4 : j20;
|
|
const int n21= j21>= 0 ? j21/2*16 + 5 : j21;
|
|
const int n22= j22>= 0 ? j22/2*16 + 6 : j22;
|
|
const int n23= j23>= 0 ? j23/2*16 + 7 : j23;
|
|
const int n24= j24>= 0 ? j24/2*16 + 8 : j24;
|
|
const int n25= j25>= 0 ? j25/2*16 + 9 : j25;
|
|
const int n26= j26>= 0 ? j26/2*16 +10 : j26;
|
|
const int n27= j27>= 0 ? j27/2*16 +11 : j27;
|
|
const int n28= j28>= 0 ? j28/2*16 +12 : j28;
|
|
const int n29= j29>= 0 ? j29/2*16 +13 : j29;
|
|
const int n30= j30>= 0 ? j30/2*16 +14 : j30;
|
|
const int n31= j31>= 0 ? j31/2*16 +15 : j31;
|
|
x1 = blend16c<n16, n17, n18, n19, n20, n21, n22, n23, n24, n25, n26, n27, n28, n29, n30, n31> (
|
|
blend16c< j16 & 2 ? -256 : i16& 31, j17 & 2 ? -256 : i17& 31, j18 & 2 ? -256 : i18& 31, j19 & 2 ? -256 : i19& 31, j20 & 2 ? -256 : i20& 31, j21 & 2 ? -256 : i21& 31, j22 & 2 ? -256 : i22& 31, j23 & 2 ? -256 : i23& 31,
|
|
j24 & 2 ? -256 : i24& 31, j25 & 2 ? -256 : i25& 31, j26 & 2 ? -256 : i26& 31, j27 & 2 ? -256 : i27& 31, j28 & 2 ? -256 : i28& 31, j29 & 2 ? -256 : i29& 31, j30 & 2 ? -256 : i30& 31, j31 & 2 ? -256 : i31& 31 > (a.get_low(),a.get_high()),
|
|
blend16c<(j16^2)& 6 ? -256 : i16& 31, (j17^2)& 6 ? -256 : i17& 31, (j18^2)& 6 ? -256 : i18& 31, (j19^2)& 6 ? -256 : i19& 31, (j20^2)& 6 ? -256 : i20& 31, (j21^2)& 6 ? -256 : i21& 31, (j22^2)& 6 ? -256 : i22& 31, (j23^2)& 6 ? -256 : i23& 31,
|
|
(j24^2)& 6 ? -256 : i24& 31, (j25^2)& 6 ? -256 : i25& 31, (j26^2)& 6 ? -256 : i26& 31, (j27^2)& 6 ? -256 : i27& 31, (j28^2)& 6 ? -256 : i28& 31, (j29^2)& 6 ? -256 : i29& 31, (j30^2)& 6 ? -256 : i30& 31, (j31^2)& 6 ? -256 : i31& 31 > (b.get_low(),b.get_high()));
|
|
}
|
|
return Vec32c(x0,x1);
|
|
}
|
|
|
|
template <
|
|
int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
|
|
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
|
|
int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
|
|
int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
|
|
static inline Vec32uc blend32uc(Vec32uc const & a, Vec32uc const & b) {
|
|
return Vec32uc (blend32c<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15,
|
|
i16,i17,i18,i19,i20,i21,i22,i23,i24,i25,i26,i27,i28,i29,i30,i31> (a, b));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Vector lookup functions
|
|
*
|
|
******************************************************************************
|
|
*
|
|
* These functions use vector elements as indexes into a table.
|
|
* The table is given as one or more vectors or as an array.
|
|
*
|
|
* This can be used for several purposes:
|
|
* - table lookup
|
|
* - permute or blend with variable indexes
|
|
* - blend from more than two sources
|
|
* - gather non-contiguous data
|
|
*
|
|
* An index out of range may produce any value - the actual value produced is
|
|
* implementation dependent and may be different for different instruction
|
|
* sets. An index out of range does not produce an error message or exception.
|
|
*
|
|
* Example:
|
|
* Vec8i a(2,0,0,6,4,3,5,0); // index a is ( 2, 0, 0, 6, 4, 3, 5, 0)
|
|
* Vec8i b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
|
|
* Vec8i c;
|
|
* c = lookup8 (a,b); // c is (102, 100, 100, 106, 104, 103, 105, 100)
|
|
*
|
|
*****************************************************************************/
|
|
|
|
static inline Vec32c lookup32(Vec32c const & index, Vec32c const & table) {
|
|
#if defined (__XOP__) // AMD XOP instruction set. Use VPPERM
|
|
Vec16c t0 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low());
|
|
Vec16c t1 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high());
|
|
return Vec32c(t0, t1);
|
|
#else
|
|
Vec16c t0 = lookup32(index.get_low() , table.get_low(), table.get_high());
|
|
Vec16c t1 = lookup32(index.get_high(), table.get_low(), table.get_high());
|
|
return Vec32c(t0, t1);
|
|
#endif
|
|
}
|
|
|
|
template <int n>
|
|
static inline Vec32c lookup(Vec32uc const & index, void const * table) {
|
|
if (n <= 0) return 0;
|
|
if (n <= 16) {
|
|
Vec16c tt = Vec16c().load(table);
|
|
Vec16c r0 = lookup16(index.get_low(), tt);
|
|
Vec16c r1 = lookup16(index.get_high(), tt);
|
|
return Vec32c(r0, r1);
|
|
}
|
|
if (n <= 32) return lookup32(index, Vec32c().load(table));
|
|
// n > 32. Limit index
|
|
Vec32uc index1;
|
|
if ((n & (n-1)) == 0) {
|
|
// n is a power of 2, make index modulo n
|
|
index1 = Vec32uc(index) & uint8_t(n-1);
|
|
}
|
|
else {
|
|
// n is not a power of 2, limit to n-1
|
|
index1 = min(Vec32uc(index), uint8_t(n-1));
|
|
}
|
|
uint8_t ii[32]; index1.store(ii);
|
|
int8_t rr[32];
|
|
for (int j = 0; j < 32; j++) {
|
|
rr[j] = ((int8_t*)table)[ii[j]];
|
|
}
|
|
return Vec32c().load(rr);
|
|
}
|
|
|
|
template <int n>
|
|
static inline Vec32c lookup(Vec32c const & index, void const * table) {
|
|
return lookup<n>(Vec32uc(index), table);
|
|
}
|
|
|
|
|
|
static inline Vec16s lookup16(Vec16s const & index, Vec16s const & table) {
|
|
Vec8s t0 = lookup16(index.get_low() , table.get_low(), table.get_high());
|
|
Vec8s t1 = lookup16(index.get_high(), table.get_low(), table.get_high());
|
|
return Vec16s(t0, t1);
|
|
}
|
|
|
|
template <int n>
|
|
static inline Vec16s lookup(Vec16s const & index, void const * table) {
|
|
if (n <= 0) return 0;
|
|
if (n <= 8) {
|
|
Vec8s table1 = Vec8s().load(table);
|
|
return Vec16s(
|
|
lookup8 (index.get_low(), table1),
|
|
lookup8 (index.get_high(), table1));
|
|
}
|
|
if (n <= 16) return lookup16(index, Vec16s().load(table));
|
|
// n > 16. Limit index
|
|
Vec16us i1;
|
|
if ((n & (n-1)) == 0) {
|
|
// n is a power of 2, make index modulo n
|
|
i1 = Vec16us(index) & (n-1);
|
|
}
|
|
else {
|
|
// n is not a power of 2, limit to n-1
|
|
i1 = min(Vec16us(index), n-1);
|
|
}
|
|
int16_t const * t = (int16_t const *)table;
|
|
return Vec16s(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]],
|
|
t[i1[8]],t[i1[9]],t[i1[10]],t[i1[11]],t[i1[12]],t[i1[13]],t[i1[14]],t[i1[15]]);
|
|
}
|
|
|
|
static inline Vec8i lookup8(Vec8i const & index, Vec8i const & table) {
|
|
Vec4i t0 = lookup8(index.get_low() , table.get_low(), table.get_high());
|
|
Vec4i t1 = lookup8(index.get_high(), table.get_low(), table.get_high());
|
|
return Vec8i(t0, t1);
|
|
}
|
|
|
|
template <int n>
|
|
static inline Vec8i lookup(Vec8i const & index, void const * table) {
|
|
if (n <= 0) return 0;
|
|
if (n <= 4) {
|
|
Vec4i table1 = Vec4i().load(table);
|
|
return Vec8i(
|
|
lookup4 (index.get_low(), table1),
|
|
lookup4 (index.get_high(), table1));
|
|
}
|
|
if (n <= 8) {
|
|
return lookup8(index, Vec8i().load(table));
|
|
}
|
|
// n > 8. Limit index
|
|
Vec8ui i1;
|
|
if ((n & (n-1)) == 0) {
|
|
// n is a power of 2, make index modulo n
|
|
i1 = Vec8ui(index) & (n-1);
|
|
}
|
|
else {
|
|
// n is not a power of 2, limit to n-1
|
|
i1 = min(Vec8ui(index), n-1);
|
|
}
|
|
int32_t const * t = (int32_t const *)table;
|
|
return Vec8i(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]]);
|
|
}
|
|
|
|
static inline Vec4q lookup4(Vec4q const & index, Vec4q const & table) {
|
|
return lookup8(Vec8i(index * 0x200000002ll + 0x100000000ll), Vec8i(table));
|
|
}
|
|
|
|
template <int n>
|
|
static inline Vec4q lookup(Vec4q const & index, void const * table) {
|
|
if (n <= 0) return 0;
|
|
// n > 0. Limit index
|
|
Vec4uq index1;
|
|
if ((n & (n-1)) == 0) {
|
|
// n is a power of 2, make index modulo n
|
|
index1 = Vec4uq(index) & (n-1);
|
|
}
|
|
else {
|
|
// n is not a power of 2, limit to n-1.
|
|
// There is no 64-bit min instruction, but we can use the 32-bit unsigned min,
|
|
// since n is a 32-bit integer
|
|
index1 = Vec4uq(min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>()));
|
|
}
|
|
uint32_t ii[8]; index1.store(ii); // use only lower 32 bits of each index
|
|
int64_t const * tt = (int64_t const *)table;
|
|
return Vec4q(tt[ii[0]], tt[ii[2]], tt[ii[4]], tt[ii[6]]);
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Other permutations with variable indexes
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// Function shift_bytes_up: shift whole vector left by b bytes.
|
|
// You may use a permute function instead if b is a compile-time constant
|
|
static inline Vec32c shift_bytes_up(Vec32c const & a, int b) {
|
|
if (b < 16) {
|
|
return Vec32c(shift_bytes_up(a.get_low(),b), shift_bytes_up(a.get_high(),b) | shift_bytes_down(a.get_low(),16-b));
|
|
}
|
|
else {
|
|
return Vec32c(Vec16c(0), shift_bytes_up(a.get_high(),b-16));
|
|
}
|
|
}
|
|
|
|
// Function shift_bytes_down: shift whole vector right by b bytes
|
|
// You may use a permute function instead if b is a compile-time constant
|
|
static inline Vec32c shift_bytes_down(Vec32c const & a, int b) {
|
|
if (b < 16) {
|
|
return Vec32c(shift_bytes_down(a.get_low(),b) | shift_bytes_up(a.get_high(),16-b), shift_bytes_down(a.get_high(),b));
|
|
}
|
|
else {
|
|
return Vec32c(shift_bytes_down(a.get_high(),b-16), Vec16c(0));
|
|
}
|
|
}
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Gather functions with fixed indexes
|
|
*
|
|
*****************************************************************************/
|
|
// Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7
|
|
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
|
|
static inline Vec8i gather8i(void const * a) {
|
|
Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index; // Error message if index is negative
|
|
const int i01min = i0 < i1 ? i0 : i1;
|
|
const int i23min = i2 < i3 ? i2 : i3;
|
|
const int i45min = i4 < i5 ? i4 : i5;
|
|
const int i67min = i6 < i7 ? i6 : i7;
|
|
const int i0123min = i01min < i23min ? i01min : i23min;
|
|
const int i4567min = i45min < i67min ? i45min : i67min;
|
|
const int imin = i0123min < i4567min ? i0123min : i4567min;
|
|
const int i01max = i0 > i1 ? i0 : i1;
|
|
const int i23max = i2 > i3 ? i2 : i3;
|
|
const int i45max = i4 > i5 ? i4 : i5;
|
|
const int i67max = i6 > i7 ? i6 : i7;
|
|
const int i0123max = i01max > i23max ? i01max : i23max;
|
|
const int i4567max = i45max > i67max ? i45max : i67max;
|
|
const int imax = i0123max > i4567max ? i0123max : i4567max;
|
|
|
|
if (imax - imin <= 7) {
|
|
// load one contiguous block and permute
|
|
if (imax > 7) {
|
|
// make sure we don't read past the end of the array
|
|
Vec8i b = Vec8i().load((int32_t const *)a + imax-7);
|
|
return permute8i<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7>(b);
|
|
}
|
|
else {
|
|
Vec8i b = Vec8i().load((int32_t const *)a + imin);
|
|
return permute8i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin>(b);
|
|
}
|
|
}
|
|
if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
|
|
&& (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
|
|
// load two contiguous blocks and blend
|
|
Vec8i b = Vec8i().load((int32_t const *)a + imin);
|
|
Vec8i c = Vec8i().load((int32_t const *)a + imax-7);
|
|
const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
|
|
const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
|
|
const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
|
|
const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
|
|
const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
|
|
const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
|
|
const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
|
|
const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
|
|
return blend8i<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
|
|
}
|
|
// use lookup function
|
|
return lookup<imax+1>(Vec8i(i0,i1,i2,i3,i4,i5,i6,i7), a);
|
|
}
|
|
|
|
template <int i0, int i1, int i2, int i3>
|
|
static inline Vec4q gather4q(void const * a) {
|
|
Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index; // Error message if index is negative
|
|
const int i01min = i0 < i1 ? i0 : i1;
|
|
const int i23min = i2 < i3 ? i2 : i3;
|
|
const int imin = i01min < i23min ? i01min : i23min;
|
|
const int i01max = i0 > i1 ? i0 : i1;
|
|
const int i23max = i2 > i3 ? i2 : i3;
|
|
const int imax = i01max > i23max ? i01max : i23max;
|
|
if (imax - imin <= 3) {
|
|
// load one contiguous block and permute
|
|
if (imax > 3) {
|
|
// make sure we don't read past the end of the array
|
|
Vec4q b = Vec4q().load((int64_t const *)a + imax-3);
|
|
return permute4q<i0-imax+3, i1-imax+3, i2-imax+3, i3-imax+3>(b);
|
|
}
|
|
else {
|
|
Vec4q b = Vec4q().load((int64_t const *)a + imin);
|
|
return permute4q<i0-imin, i1-imin, i2-imin, i3-imin>(b);
|
|
}
|
|
}
|
|
if ((i0<imin+4 || i0>imax-4) && (i1<imin+4 || i1>imax-4) && (i2<imin+4 || i2>imax-4) && (i3<imin+4 || i3>imax-4)) {
|
|
// load two contiguous blocks and blend
|
|
Vec4q b = Vec4q().load((int64_t const *)a + imin);
|
|
Vec4q c = Vec4q().load((int64_t const *)a + imax-3);
|
|
const int j0 = i0<imin+4 ? i0-imin : 7-imax+i0;
|
|
const int j1 = i1<imin+4 ? i1-imin : 7-imax+i1;
|
|
const int j2 = i2<imin+4 ? i2-imin : 7-imax+i2;
|
|
const int j3 = i3<imin+4 ? i3-imin : 7-imax+i3;
|
|
return blend4q<j0, j1, j2, j3>(b, c);
|
|
}
|
|
// use lookup function
|
|
return lookup<imax+1>(Vec4q(i0,i1,i2,i3), a);
|
|
}
|
|
|
|
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Functions for conversion between integer sizes
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// Extend 8-bit integers to 16-bit integers, signed and unsigned
|
|
|
|
// Function extend_low : extends the low 16 elements to 16 bits with sign extension
|
|
static inline Vec16s extend_low (Vec32c const & a) {
|
|
return Vec16s(extend_low(a.get_low()), extend_high(a.get_low()));
|
|
}
|
|
|
|
// Function extend_high : extends the high 16 elements to 16 bits with sign extension
|
|
static inline Vec16s extend_high (Vec32c const & a) {
|
|
return Vec16s(extend_low(a.get_high()), extend_high(a.get_high()));
|
|
}
|
|
|
|
// Function extend_low : extends the low 16 elements to 16 bits with zero extension
|
|
static inline Vec16us extend_low (Vec32uc const & a) {
|
|
return Vec16us(extend_low(a.get_low()), extend_high(a.get_low()));
|
|
}
|
|
|
|
// Function extend_high : extends the high 19 elements to 16 bits with zero extension
|
|
static inline Vec16us extend_high (Vec32uc const & a) {
|
|
return Vec16us(extend_low(a.get_high()), extend_high(a.get_high()));
|
|
}
|
|
|
|
// Extend 16-bit integers to 32-bit integers, signed and unsigned
|
|
|
|
// Function extend_low : extends the low 8 elements to 32 bits with sign extension
|
|
static inline Vec8i extend_low (Vec16s const & a) {
|
|
return Vec8i(extend_low(a.get_low()), extend_high(a.get_low()));
|
|
}
|
|
|
|
// Function extend_high : extends the high 8 elements to 32 bits with sign extension
|
|
static inline Vec8i extend_high (Vec16s const & a) {
|
|
return Vec8i(extend_low(a.get_high()), extend_high(a.get_high()));
|
|
}
|
|
|
|
// Function extend_low : extends the low 8 elements to 32 bits with zero extension
|
|
static inline Vec8ui extend_low (Vec16us const & a) {
|
|
return Vec8ui(extend_low(a.get_low()), extend_high(a.get_low()));
|
|
}
|
|
|
|
// Function extend_high : extends the high 8 elements to 32 bits with zero extension
|
|
static inline Vec8ui extend_high (Vec16us const & a) {
|
|
return Vec8ui(extend_low(a.get_high()), extend_high(a.get_high()));
|
|
}
|
|
|
|
// Extend 32-bit integers to 64-bit integers, signed and unsigned
|
|
|
|
// Function extend_low : extends the low 4 elements to 64 bits with sign extension
|
|
static inline Vec4q extend_low (Vec8i const & a) {
|
|
return Vec4q(extend_low(a.get_low()), extend_high(a.get_low()));
|
|
}
|
|
|
|
// Function extend_high : extends the high 4 elements to 64 bits with sign extension
|
|
static inline Vec4q extend_high (Vec8i const & a) {
|
|
return Vec4q(extend_low(a.get_high()), extend_high(a.get_high()));
|
|
}
|
|
|
|
// Function extend_low : extends the low 4 elements to 64 bits with zero extension
|
|
static inline Vec4uq extend_low (Vec8ui const & a) {
|
|
return Vec4uq(extend_low(a.get_low()), extend_high(a.get_low()));
|
|
}
|
|
|
|
// Function extend_high : extends the high 4 elements to 64 bits with zero extension
|
|
static inline Vec4uq extend_high (Vec8ui const & a) {
|
|
return Vec4uq(extend_low(a.get_high()), extend_high(a.get_high()));
|
|
}
|
|
|
|
// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
|
|
|
|
// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
|
|
// Overflow wraps around
|
|
static inline Vec32c compress (Vec16s const & low, Vec16s const & high) {
|
|
return Vec32c(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
|
|
// Signed, with saturation
|
|
static inline Vec32c compress_saturated (Vec16s const & low, Vec16s const & high) {
|
|
return Vec32c(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers
|
|
// Unsigned, overflow wraps around
|
|
static inline Vec32uc compress (Vec16us const & low, Vec16us const & high) {
|
|
return Vec32uc(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
|
|
// Unsigned, with saturation
|
|
static inline Vec32uc compress_saturated (Vec16us const & low, Vec16us const & high) {
|
|
return Vec32uc(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation
|
|
|
|
// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
|
|
// Overflow wraps around
|
|
static inline Vec16s compress (Vec8i const & low, Vec8i const & high) {
|
|
return Vec16s(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
|
|
// Signed with saturation
|
|
static inline Vec16s compress_saturated (Vec8i const & low, Vec8i const & high) {
|
|
return Vec16s(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
|
|
// Overflow wraps around
|
|
static inline Vec16us compress (Vec8ui const & low, Vec8ui const & high) {
|
|
return Vec16us(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
|
|
// Unsigned, with saturation
|
|
static inline Vec16us compress_saturated (Vec8ui const & low, Vec8ui const & high) {
|
|
return Vec16us(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
|
|
|
|
// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
|
|
// Overflow wraps around
|
|
static inline Vec8i compress (Vec4q const & low, Vec4q const & high) {
|
|
return Vec8i(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
|
|
// Signed, with saturation
|
|
static inline Vec8i compress_saturated (Vec4q const & low, Vec4q const & high) {
|
|
return Vec8i(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
|
|
// Overflow wraps around
|
|
static inline Vec8ui compress (Vec4uq const & low, Vec4uq const & high) {
|
|
return Vec8ui (compress((Vec4q)low, (Vec4q)high));
|
|
}
|
|
|
|
// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
|
|
// Unsigned, with saturation
|
|
static inline Vec8ui compress_saturated (Vec4uq const & low, Vec4uq const & high) {
|
|
return Vec8ui(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Integer division 2: divisor is a compile-time constant
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// Divide Vec8i by compile-time constant
|
|
template <int32_t d>
|
|
static inline Vec8i divide_by_i(Vec8i const & a) {
|
|
return Vec8i( divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
|
|
}
|
|
|
|
// define Vec8i a / const_int(d)
|
|
template <int32_t d>
|
|
static inline Vec8i operator / (Vec8i const & a, Const_int_t<d>) {
|
|
return divide_by_i<d>(a);
|
|
}
|
|
|
|
// define Vec8i a / const_uint(d)
|
|
template <uint32_t d>
|
|
static inline Vec8i operator / (Vec8i const & a, Const_uint_t<d>) {
|
|
Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
|
|
return divide_by_i<int32_t(d)>(a); // signed divide
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <int32_t d>
|
|
static inline Vec8i & operator /= (Vec8i & a, Const_int_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <uint32_t d>
|
|
static inline Vec8i & operator /= (Vec8i & a, Const_uint_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
|
|
// Divide Vec8ui by compile-time constant
|
|
template <uint32_t d>
|
|
static inline Vec8ui divide_by_ui(Vec8ui const & a) {
|
|
return Vec8ui( divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
|
|
}
|
|
|
|
// define Vec8ui a / const_uint(d)
|
|
template <uint32_t d>
|
|
static inline Vec8ui operator / (Vec8ui const & a, Const_uint_t<d>) {
|
|
return divide_by_ui<d>(a);
|
|
}
|
|
|
|
// define Vec8ui a / const_int(d)
|
|
template <int32_t d>
|
|
static inline Vec8ui operator / (Vec8ui const & a, Const_int_t<d>) {
|
|
Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
|
|
return divide_by_ui<d>(a); // unsigned divide
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <uint32_t d>
|
|
static inline Vec8ui & operator /= (Vec8ui & a, Const_uint_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <int32_t d>
|
|
static inline Vec8ui & operator /= (Vec8ui & a, Const_int_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
|
|
// Divide Vec16s by compile-time constant
|
|
template <int d>
|
|
static inline Vec16s divide_by_i(Vec16s const & a) {
|
|
return Vec16s( divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
|
|
}
|
|
|
|
// define Vec16s a / const_int(d)
|
|
template <int d>
|
|
static inline Vec16s operator / (Vec16s const & a, Const_int_t<d>) {
|
|
return divide_by_i<d>(a);
|
|
}
|
|
|
|
// define Vec16s a / const_uint(d)
|
|
template <uint32_t d>
|
|
static inline Vec16s operator / (Vec16s const & a, Const_uint_t<d>) {
|
|
Static_error_check< (d<0x8000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
|
|
return divide_by_i<int(d)>(a); // signed divide
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <int32_t d>
|
|
static inline Vec16s & operator /= (Vec16s & a, Const_int_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <uint32_t d>
|
|
static inline Vec16s & operator /= (Vec16s & a, Const_uint_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
|
|
// Divide Vec16us by compile-time constant
|
|
template <uint32_t d>
|
|
static inline Vec16us divide_by_ui(Vec16us const & a) {
|
|
return Vec16us( divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
|
|
}
|
|
|
|
// define Vec16us a / const_uint(d)
|
|
template <uint32_t d>
|
|
static inline Vec16us operator / (Vec16us const & a, Const_uint_t<d>) {
|
|
return divide_by_ui<d>(a);
|
|
}
|
|
|
|
// define Vec16us a / const_int(d)
|
|
template <int d>
|
|
static inline Vec16us operator / (Vec16us const & a, Const_int_t<d>) {
|
|
Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
|
|
return divide_by_ui<d>(a); // unsigned divide
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <uint32_t d>
|
|
static inline Vec16us & operator /= (Vec16us & a, Const_uint_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <int32_t d>
|
|
static inline Vec16us & operator /= (Vec16us & a, Const_int_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
|
|
// define Vec32c a / const_int(d)
|
|
template <int d>
|
|
static inline Vec32c operator / (Vec32c const & a, Const_int_t<d>) {
|
|
// expand into two Vec16s
|
|
Vec16s low = extend_low(a) / Const_int_t<d>();
|
|
Vec16s high = extend_high(a) / Const_int_t<d>();
|
|
return compress(low,high);
|
|
}
|
|
|
|
// define Vec32c a / const_uint(d)
|
|
template <uint32_t d>
|
|
static inline Vec32c operator / (Vec32c const & a, Const_uint_t<d>) {
|
|
Static_error_check< (uint8_t(d)<0x80u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
|
|
return a / Const_int_t<d>(); // signed divide
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <int32_t d>
|
|
static inline Vec32c & operator /= (Vec32c & a, Const_int_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
// vector operator /= : divide
|
|
template <uint32_t d>
|
|
static inline Vec32c & operator /= (Vec32c & a, Const_uint_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
// define Vec32uc a / const_uint(d)
|
|
template <uint32_t d>
|
|
static inline Vec32uc operator / (Vec32uc const & a, Const_uint_t<d>) {
|
|
// expand into two Vec16us
|
|
Vec16us low = extend_low(a) / Const_uint_t<d>();
|
|
Vec16us high = extend_high(a) / Const_uint_t<d>();
|
|
return compress(low,high);
|
|
}
|
|
|
|
// define Vec32uc a / const_int(d)
|
|
template <int d>
|
|
static inline Vec32uc operator / (Vec32uc const & a, Const_int_t<d>) {
|
|
Static_error_check< (int8_t(d)>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
|
|
return a / Const_uint_t<d>(); // unsigned divide
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <uint32_t d>
|
|
static inline Vec32uc & operator /= (Vec32uc & a, Const_uint_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
// vector operator /= : divide
|
|
template <int32_t d>
|
|
static inline Vec32uc & operator /= (Vec32uc & a, Const_int_t<d> b) {
|
|
a = a / b;
|
|
return a;
|
|
}
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Horizontal scan functions
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// Get index to the first element that is true. Return -1 if all are false
|
|
static inline int horizontal_find_first(Vec32cb const & x) {
|
|
int a1 = horizontal_find_first(x.get_low());
|
|
if (a1 >= 0) return a1;
|
|
int a2 = horizontal_find_first(x.get_high());
|
|
if (a2 < 0) return a2;
|
|
return a2 + 16;;
|
|
}
|
|
|
|
static inline int horizontal_find_first(Vec16sb const & x) {
|
|
return horizontal_find_first(Vec32cb(x)) >> 1;
|
|
}
|
|
|
|
static inline int horizontal_find_first(Vec8ib const & x) {
|
|
return horizontal_find_first(Vec32cb(x)) >> 2;
|
|
}
|
|
|
|
static inline int horizontal_find_first(Vec4qb const & x) {
|
|
return horizontal_find_first(Vec32cb(x)) >> 3;
|
|
}
|
|
|
|
// Count the number of elements that are true
|
|
static inline uint32_t horizontal_count(Vec32cb const & x) {
|
|
return horizontal_count(x.get_low()) + horizontal_count(x.get_high());
|
|
}
|
|
|
|
static inline uint32_t horizontal_count(Vec16sb const & x) {
|
|
return horizontal_count(Vec32cb(x)) >> 1;
|
|
}
|
|
|
|
static inline uint32_t horizontal_count(Vec8ib const & x) {
|
|
return horizontal_count(Vec32cb(x)) >> 2;
|
|
}
|
|
|
|
static inline uint32_t horizontal_count(Vec4qb const & x) {
|
|
return horizontal_count(Vec32cb(x)) >> 3;
|
|
}
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Boolean <-> bitfield conversion functions
|
|
*
|
|
*****************************************************************************/
|
|
|
|
// to_bits: convert boolean vector to integer bitfield
|
|
static inline uint32_t to_bits(Vec32cb const & x) {
|
|
return to_bits(x.get_low()) | (uint32_t)to_bits(x.get_high()) << 16;
|
|
}
|
|
|
|
// to_Vec16c: convert integer bitfield to boolean vector
|
|
static inline Vec32cb to_Vec32cb(uint32_t x) {
|
|
return Vec32c(to_Vec16cb(uint16_t(x)), to_Vec16cb(uint16_t(x>>16)));
|
|
}
|
|
|
|
// to_bits: convert boolean vector to integer bitfield
|
|
static inline uint16_t to_bits(Vec16sb const & x) {
|
|
return to_bits(x.get_low()) | (uint16_t)to_bits(x.get_high()) << 8;
|
|
}
|
|
|
|
// to_Vec16sb: convert integer bitfield to boolean vector
|
|
static inline Vec16sb to_Vec16sb(uint16_t x) {
|
|
return Vec16s(to_Vec8sb(uint8_t(x)), to_Vec8sb(uint8_t(x>>8)));
|
|
}
|
|
|
|
// to_bits: convert boolean vector to integer bitfield
|
|
static inline uint8_t to_bits(Vec8ib const & x) {
|
|
return to_bits(x.get_low()) | (uint8_t)to_bits(x.get_high()) << 4;
|
|
}
|
|
|
|
// to_Vec8ib: convert integer bitfield to boolean vector
|
|
static inline Vec8ib to_Vec8ib(uint8_t x) {
|
|
return Vec8i(to_Vec4ib(x), to_Vec4ib(x>>4));
|
|
}
|
|
|
|
// to_bits: convert boolean vector to integer bitfield
|
|
static inline uint8_t to_bits(Vec4qb const & x) {
|
|
return to_bits(x.get_low()) | to_bits(x.get_high()) << 2;
|
|
}
|
|
|
|
// to_Vec16c: convert integer bitfield to boolean vector
|
|
static inline Vec4qb to_Vec4qb(uint8_t x) {
|
|
return Vec4q(to_Vec2qb(x), to_Vec2qb(x>>2));
|
|
}
|
|
|
|
#endif // VECTORI256_H
|