ClickHouse/libs/libvectorclass/vectormath_trig.h

1042 lines
33 KiB
C
Raw Normal View History

/**************************** vectormath_trig.h ******************************
* Author: Agner Fog
* Date created: 2014-04-18
* Last modified: 2014-10-22
* Version: 1.16
* Project: vector classes
* Description:
* Header file containing inline version of trigonometric functions
* and inverse trigonometric functions
* sin, cos, sincos, tan
* asin, acos, atan, atan2
*
* Theory, methods and inspiration based partially on these sources:
* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
* Ellis Horwood, 1989.
* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
* Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
* > Cephes math library by Stephen L. Moshier 1992,
* http://www.netlib.org/cephes/
*
* For detailed instructions, see vectormath_common.h and VectorClass.pdf
*
* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
******************************************************************************/
#ifndef VECTORMATH_TRIG_H
#define VECTORMATH_TRIG_H 1
#include "vectormath_common.h"
// Different overloaded functions for template resolution.
// These are used to fix the problem that the quadrant index uses
// a vector of 32-bit integers which doesn't fit the size of the
// 64-bit double precision vector:
// VTYPE | ITYPE | ITYPEH
// -----------------------
// Vec2d | Vec2q | Vec4i
// Vec4d | Vec4q | Vec4i
// Vec8d | Vec8q | Vec8i
// define overloaded truncate functions
static inline Vec4i vm_truncate_low_to_int(Vec2d const & x) {
return truncate_to_int(x,x);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec4i vm_truncate_low_to_int(Vec4d const & x) {
return truncate_to_int(x);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec8i vm_truncate_low_to_int(Vec8d const & x) {
return truncate_to_int(x);
}
#endif // MAX_VECTOR_SIZE >= 512
// define int -> double conversions
template<class VTYPE, class ITYPE>
static inline VTYPE vm_half_int_vector_to_double(ITYPE const & x);
template<>
inline Vec2d vm_half_int_vector_to_double<Vec2d, Vec4i>(Vec4i const & x) {
return to_double_low(x);
}
#if MAX_VECTOR_SIZE >= 256
template<>
inline Vec4d vm_half_int_vector_to_double<Vec4d, Vec4i>(Vec4i const & x) {
return to_double(x);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
template<>
inline Vec8d vm_half_int_vector_to_double<Vec8d, Vec8i>(Vec8i const & x) {
return to_double(x);
}
#endif // MAX_VECTOR_SIZE >= 512
// define int32_t to int64_t conversions
template<class ITYPE, class ITYPEH>
static inline ITYPE vm_half_int_vector_to_full(ITYPEH const & x);
template<>
inline Vec2q vm_half_int_vector_to_full<Vec2q,Vec4i>(Vec4i const & x) {
return extend_low(x);
}
#if MAX_VECTOR_SIZE >= 256
template<>
inline Vec4q vm_half_int_vector_to_full<Vec4q,Vec4i>(Vec4i const & x) {
return extend_low(Vec8i(x,x));
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
template<>
inline Vec8q vm_half_int_vector_to_full<Vec8q,Vec8i>(Vec8i const & x) {
return extend_low(Vec16i(x,x));
}
#endif // MAX_VECTOR_SIZE >= 512
// *************************************************************
// sincos template, double precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// ITYPE: integer vector type with same element size
// ITYPEH: integer vector type with half the element size
// BTYPE: boolean vector type
// SC: 1 = sin, 2 = cos, 3 = sincos
// Paramterers:
// xx = input x (radians)
// cosret = return pointer (only if SC = 3)
template<class VTYPE, class ITYPE, class ITYPEH, class BTYPE, int SC>
static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const & xx) {
// define constants
const double ONEOPIO4 = 4./VM_PI;
const double P0sin =-1.66666666666666307295E-1;
const double P1sin = 8.33333333332211858878E-3;
const double P2sin =-1.98412698295895385996E-4;
const double P3sin = 2.75573136213857245213E-6;
const double P4sin =-2.50507477628578072866E-8;
const double P5sin = 1.58962301576546568060E-10;
const double P0cos = 4.16666666666665929218E-2;
const double P1cos =-1.38888888888730564116E-3;
const double P2cos = 2.48015872888517045348E-5;
const double P3cos =-2.75573141792967388112E-7;
const double P4cos = 2.08757008419747316778E-9;
const double P5cos =-1.13585365213876817300E-11;
const double DP1 = 7.853981554508209228515625E-1;
const double DP2 = 7.94662735614792836714E-9;
const double DP3 = 3.06161699786838294307E-17;
/*
const double DP1sc = 7.85398125648498535156E-1;
const double DP2sc = 3.77489470793079817668E-8;
const double DP3sc = 2.69515142907905952645E-15;
*/
VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors
ITYPEH q; // integer vectors, 32 bit
ITYPE qq, signsin, signcos; // integer vectors, 64 bit
BTYPE swap, overflow; // boolean vectors
xa = abs(xx);
// Find quadrant
// 0 - pi/4 => 0
// pi/4 - 3*pi/4 => 2
// 3*pi/4 - 5*pi/4 => 4
// 5*pi/4 - 7*pi/4 => 6
// 7*pi/4 - 8*pi/4 => 8
// truncate to integer (magic number conversion is not faster here)
q = vm_truncate_low_to_int(xa * ONEOPIO4);
q = (q + 1) & ~1;
y = vm_half_int_vector_to_double<VTYPE>(q); // quadrant, as double
// Reduce by extended precision modular arithmetic
x = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa))); // x = ((xa - y * DP1) - y * DP2) - y * DP3;
// Expansion of sin and cos, valid for -pi/4 <= x <= pi/4
x2 = x * x;
s = polynomial_5(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
c = polynomial_5(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
s = mul_add(x * x2, s, x); // s = x + (x * x2) * s;
c = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0)); // c = 1.0 - x2 * 0.5 + (x2 * x2) * c;
// correct for quadrant
qq = vm_half_int_vector_to_full<ITYPE,ITYPEH>(q);
swap = BTYPE((qq & 2) != 0);
// check for overflow
if (horizontal_or(q < 0)) {
overflow = (y < 0) & is_finite(xa);
s = select(overflow, 0., s);
c = select(overflow, 1., c);
}
if (SC & 1) { // calculate sin
sin1 = select(swap, c, s);
signsin = ((qq << 61) ^ ITYPE(reinterpret_i(xx))) & ITYPE(1ULL << 63);
sin1 ^= reinterpret_d(signsin);
}
if (SC & 2) { // calculate cos
cos1 = select(swap, s, c);
signcos = ((qq + 2) << 61) & (1ULL << 63);
cos1 ^= reinterpret_d(signcos);
}
if (SC == 3) { // calculate both. cos returned through pointer
*cosret = cos1;
}
if (SC & 1) return sin1; else return cos1;
}
// instantiations of sincos_d template:
static inline Vec2d sin(Vec2d const & x) {
return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 1>(0, x);
}
static inline Vec2d cos(Vec2d const & x) {
return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 2>(0, x);
}
static inline Vec2d sincos(Vec2d * cosret, Vec2d const & x) {
return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 3>(cosret, x);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec4d sin(Vec4d const & x) {
return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 1>(0, x);
}
static inline Vec4d cos(Vec4d const & x) {
return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 2>(0, x);
}
static inline Vec4d sincos(Vec4d * cosret, Vec4d const & x) {
return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 3>(cosret, x);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec8d sin(Vec8d const & x) {
return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 1>(0, x);
}
static inline Vec8d cos(Vec8d const & x) {
return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 2>(0, x);
}
static inline Vec8d sincos(Vec8d * cosret, Vec8d const & x) {
return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 3>(cosret, x);
}
#endif // MAX_VECTOR_SIZE >= 512
// *************************************************************
// sincos template, single precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// ITYPE: integer vector type with same element size
// BTYPE: boolean vector type
// SC: 1 = sin, 2 = cos, 3 = sincos, 4 = tan
// Paramterers:
// xx = input x (radians)
// cosret = return pointer (only if SC = 3)
template<class VTYPE, class ITYPE, class BTYPE, int SC>
static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const & xx) {
// define constants
const float ONEOPIO4f = (float)(4./VM_PI);
const float DP1F = 0.78515625f;
const float DP2F = 2.4187564849853515625E-4f;
const float DP3F = 3.77489497744594108E-8f;
const float P0sinf = -1.6666654611E-1f;
const float P1sinf = 8.3321608736E-3f;
const float P2sinf = -1.9515295891E-4f;
const float P0cosf = 4.166664568298827E-2f;
const float P1cosf = -1.388731625493765E-3f;
const float P2cosf = 2.443315711809948E-5f;
VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors
ITYPE q, signsin, signcos; // integer vectors
BTYPE swap, overflow; // boolean vectors
xa = abs(xx);
// Find quadrant
// 0 - pi/4 => 0
// pi/4 - 3*pi/4 => 2
// 3*pi/4 - 5*pi/4 => 4
// 5*pi/4 - 7*pi/4 => 6
// 7*pi/4 - 8*pi/4 => 8
q = truncate_to_int(xa * ONEOPIO4f);
q = (q + 1) & ~1;
y = to_float(q); // quadrant, as float
// Reduce by extended precision modular arithmetic
x = nmul_add(y, DP3F, nmul_add(y, DP2F, nmul_add(y, DP1F, xa))); // x = ((xa - y * DP1F) - y * DP2F) - y * DP3F;
// A two-step reduction saves time at the cost of precision for very big x:
//x = (xa - y * DP1F) - y * (DP2F+DP3F);
// Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4
x2 = x * x;
s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2) + x;
c = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2*x2) + nmul_add(0.5f, x2, 1.0f);
// correct for quadrant
swap = BTYPE((q & 2) != 0);
// check for overflow
overflow = BTYPE(q < 0); // q = 0x80000000 if overflow
if (horizontal_or(overflow & is_finite(xa))) {
s = select(overflow, 0.f, s);
c = select(overflow, 1.f, c);
}
if (SC & 5) { // calculate sin
sin1 = select(swap, c, s);
signsin = ((q << 29) ^ ITYPE(reinterpret_i(xx))) & ITYPE(1 << 31);
sin1 ^= reinterpret_f(signsin);
}
if (SC & 6) { // calculate cos
cos1 = select(swap, s, c);
signcos = ((q + 2) << 29) & (1 << 31);
cos1 ^= reinterpret_f(signcos);
}
if (SC == 1) return sin1;
else if (SC == 2) return cos1;
else if (SC == 3) { // calculate both. cos returned through pointer
*cosret = cos1;
return sin1;
}
else /*if (SC == 4)*/ return sin1 / cos1;
}
// instantiations of sincos_f template:
static inline Vec4f sin(Vec4f const & x) {
return sincos_f<Vec4f, Vec4i, Vec4fb, 1>(0, x);
}
static inline Vec4f cos(Vec4f const & x) {
return sincos_f<Vec4f, Vec4i, Vec4fb, 2>(0, x);
}
static inline Vec4f sincos(Vec4f * cosret, Vec4f const & x) {
return sincos_f<Vec4f, Vec4i, Vec4fb, 3>(cosret, x);
}
static inline Vec4f tan(Vec4f const & x) {
return sincos_f<Vec4f, Vec4i, Vec4fb, 4>(0, x);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec8f sin(Vec8f const & x) {
return sincos_f<Vec8f, Vec8i, Vec8fb, 1>(0, x);
}
static inline Vec8f cos(Vec8f const & x) {
return sincos_f<Vec8f, Vec8i, Vec8fb, 2>(0, x);
}
static inline Vec8f sincos(Vec8f * cosret, Vec8f const & x) {
return sincos_f<Vec8f, Vec8i, Vec8fb, 3>(cosret, x);
}
static inline Vec8f tan(Vec8f const & x) {
return sincos_f<Vec8f, Vec8i, Vec8fb, 4>(0, x);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec16f sin(Vec16f const & x) {
return sincos_f<Vec16f, Vec16i, Vec16fb, 1>(0, x);
}
static inline Vec16f cos(Vec16f const & x) {
return sincos_f<Vec16f, Vec16i, Vec16fb, 2>(0, x);
}
static inline Vec16f sincos(Vec16f * cosret, Vec16f const & x) {
return sincos_f<Vec16f, Vec16i, Vec16fb, 3>(cosret, x);
}
static inline Vec16f tan(Vec16f const & x) {
return sincos_f<Vec16f, Vec16i, Vec16fb, 4>(0, x);
}
#endif // MAX_VECTOR_SIZE >= 512
// *************************************************************
// tan template, double precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// ITYPE: integer vector type with same element size
// ITYPEH: integer vector type with half the element size
// BTYPE: boolean vector type
// Paramterers:
// x = input x (radians)
template<class VTYPE, class ITYPE, class ITYPEH, class BTYPE>
static inline VTYPE tan_d(VTYPE const & x) {
// define constants
const double ONEOPIO4 = 4./VM_PI;
const double DP1 = 7.853981554508209228515625E-1;
const double DP2 = 7.94662735614792836714E-9;
const double DP3 = 3.06161699786838294307E-17;
const double P2tan=-1.30936939181383777646E4;
const double P1tan=1.15351664838587416140E6;
const double P0tan=-1.79565251976484877988E7;
const double Q3tan = 1.36812963470692954678E4;
const double Q2tan = -1.32089234440210967447E6;
const double Q1tan = 2.50083801823357915839E7;
const double Q0tan = -5.38695755929454629881E7;
VTYPE xa, y, z, zz, px, qx, tn, recip; // data vectors
ITYPEH q; // integer vector, 32 bit
ITYPE qq; // integer vector, 64 bit
BTYPE doinvert, xzero, overflow; // boolean vectors
xa = abs(x);
// Find quadrant
// 0 - pi/4 => 0
// pi/4 - 3*pi/4 => 2
// 3*pi/4 - 5*pi/4 => 4
// 5*pi/4 - 7*pi/4 => 6
// 7*pi/4 - 8*pi/4 => 8
q = vm_truncate_low_to_int(xa * ONEOPIO4);
q = (q + 1) & ~1;
y = vm_half_int_vector_to_double<VTYPE>(q); // quadrant, as double
// Reduce by extended precision modular arithmetic
z = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa))); //z = ((xa - y * DP1) - y * DP2) - y * DP3;
// Pade expansion of tan, valid for -pi/4 <= x <= pi/4
zz = z * z;
px = polynomial_2 (zz, P0tan, P1tan, P2tan);
qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan);
// qx cannot be 0 for x <= pi/4
tn = mul_add(px / qx, z * zz, z); // tn = z + z * zz * px / qx;
// if (q&2) tn = -1/tn
qq = vm_half_int_vector_to_full<ITYPE,ITYPEH>(q);
doinvert = BTYPE((qq & 2) != 0);
xzero = (xa == 0.);
// avoid division by 0. We will not be using recip anyway if xa == 0.
// tn never becomes exactly 0 when x = pi/2 so we only have to make
// a special case for x == 0.
recip = (-1.) / select(xzero, VTYPE(-1.), tn);
tn = select(doinvert, recip, tn);
tn = sign_combine(tn, x); // get original sign
// check for overflow
if (horizontal_or(q < 0)) {
overflow = (y < 0) & is_finite(xa);
tn = select(overflow, 0., tn);
}
return tn;
}
// instantiations of tan_d template:
static inline Vec2d tan(Vec2d const & x) {
return tan_d<Vec2d, Vec2q, Vec4i, Vec2db>(x);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec4d tan(Vec4d const & x) {
return tan_d<Vec4d, Vec4q, Vec4i, Vec4db>(x);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec8d tan(Vec8d const & x) {
return tan_d<Vec8d, Vec8q, Vec8i, Vec8db>(x);
}
#endif // MAX_VECTOR_SIZE >= 512
/*
This is removed for the single precision version.
It is faster to use tan(x) = sin(x)/cos(x)
// *************************************************************
// tan template, single precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// ITYPE: integer vector type with same element size
// BTYPE: boolean vector type
// Paramterers:
// x = input x (radians)
// cosret = return pointer (only if SC = 3)
template<class VTYPE, class ITYPE, class BTYPE>
static inline VTYPE tan_f(VTYPE const & x) {
// define constants
const float ONEOPIO4f = (float)(4./VM_PI);
const float DP1F = 0.78515625f;
const float DP2F = 2.4187564849853515625E-4f;
const float DP3F = 3.77489497744594108E-8f;
const float P5tanf = 9.38540185543E-3f;
const float P4tanf = 3.11992232697E-3f;
const float P3tanf = 2.44301354525E-2f;
const float P2tanf = 5.34112807005E-2f;
const float P1tanf = 1.33387994085E-1f;
const float P0tanf = 3.33331568548E-1f;
VTYPE xa, y, z, zz, tn, recip; // data vectors
ITYPE q; // integer vector
BTYPE doinvert, xzero; // boolean vectors
xa = abs(x);
// Find quadrant
// 0 - pi/4 => 0
// pi/4 - 3*pi/4 => 2
// 3*pi/4 - 5*pi/4 => 4
// 5*pi/4 - 7*pi/4 => 6
// 7*pi/4 - 8*pi/4 => 8
q = truncate_to_int(xa * ONEOPIO4f);
q = (q + 1) & ~1;
y = to_float(q); // quadrant, as float
// Reduce by extended precision modular arithmetic
z = ((xa - y * DP1F) - y * DP2F) - y * DP3F;
//z = (xa - y * DP1F) - y * (DP2F + DP3F);
zz = z * z;
// Taylor expansion
tn = polynomial_5(zz, P0tanf, P1tanf, P2tanf, P3tanf, P4tanf, P5tanf) * (zz * z) + z;
// if (q&2) tn = -1/tn
doinvert = (q & 2) != 0;
xzero = (xa == 0.f);
// avoid division by 0. We will not be using recip anyway if xa == 0.
// tn never becomes exactly 0 when x = pi/2 so we only have to make
// a special case for x == 0.
recip = (-1.f) / select(xzero, VTYPE(-1.f), tn);
tn = select(doinvert, recip, tn);
tn = sign_combine(tn, x); // get original sign
return tn;
}
// instantiations of tan_f template:
static inline Vec4f tan(Vec4f const & x) {
return tan_f<Vec4f, Vec4i, Vec4fb>(x);
}
static inline Vec8f tan(Vec8f const & x) {
return tan_f<Vec8f, Vec8i, Vec8fb>(x);
}
*/
// *************************************************************
// asin/acos template, double precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// BTYPE: boolean vector type
// AC: 0 = asin, 1 = acos
// Paramterers:
// x = input x
template<class VTYPE, class BTYPE, int AC>
static inline VTYPE asin_d(VTYPE const & x) {
// define constants
const double R4asin = 2.967721961301243206100E-3;
const double R3asin = -5.634242780008963776856E-1;
const double R2asin = 6.968710824104713396794E0;
const double R1asin = -2.556901049652824852289E1;
const double R0asin = 2.853665548261061424989E1;
const double S3asin = -2.194779531642920639778E1;
const double S2asin = 1.470656354026814941758E2;
const double S1asin = -3.838770957603691357202E2;
const double S0asin = 3.424398657913078477438E2;
const double P5asin = 4.253011369004428248960E-3;
const double P4asin = -6.019598008014123785661E-1;
const double P3asin = 5.444622390564711410273E0;
const double P2asin = -1.626247967210700244449E1;
const double P1asin = 1.956261983317594739197E1;
const double P0asin = -8.198089802484824371615E0;
const double Q4asin = -1.474091372988853791896E1;
const double Q3asin = 7.049610280856842141659E1;
const double Q2asin = -1.471791292232726029859E2;
const double Q1asin = 1.395105614657485689735E2;
const double Q0asin = -4.918853881490881290097E1;
VTYPE xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, yb, z, z1, z2;
BTYPE big;
bool dobig, dosmall;
xa = abs(x);
big = xa >= 0.625;
/*
Small: xa < 0.625
------------------
x = xa * xa;
px = PX(x);
qx = QX(x);
y1 = x*px/qx;
y1 = xa * y1 + xa;
Big: xa >= 0.625
------------------
x = 1.0 - xa;
rx = RX(x);
sx = SX(x);
y1 = x * rx/sx;
x3 = sqrt(x+x);
y3 = x3 * y1 - MOREBITS;
z = pi/2 - x3 - y3
*/
// select a common x for all polynomials
// This allows sharing of powers of x through common subexpression elimination
x1 = select(big, 1.0 - xa, xa * xa);
// calculate powers of x1 outside branches to make sure they are only calculated once
x2 = x1 * x1;
x4 = x2 * x2;
x5 = x4 * x1;
x3 = x2 * x1;
dosmall = !horizontal_and(big); // at least one element is small
dobig = horizontal_or(big) ; // at least one element is big
// calculate polynomials (reuse powers of x)
if (dosmall) {
// px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin);
// qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin);
px = mul_add(x3,P3asin,P0asin) + mul_add(x4,P4asin,x1*P1asin) + mul_add(x5,P5asin,x2*P2asin);
qx = mul_add(x4,Q4asin,x5) + mul_add(x3,Q3asin,x1*Q1asin) + mul_add(x2,Q2asin,Q0asin);
}
if (dobig) {
// rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin);
// sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin);
rx = mul_add(x3,R3asin,x2*R2asin) + mul_add(x4,R4asin,mul_add(x1,R1asin,R0asin));
sx = mul_add(x3,S3asin,x4) + mul_add(x2,S2asin,mul_add(x1,S1asin,S0asin));
}
// select and divide outside branches to avoid dividing twice
vx = select(big, rx, px);
wx = select(big, sx, qx);
y1 = vx/wx * x1;
// results for big
if (dobig) { // avoid square root if all are small
xb = sqrt(x1+x1); // this produces NAN if xa > 1 so we don't need a special case for xa > 1
z1 = mul_add(xb, y1, xb); // yb = xb * y1; z1 = xb + yb;
}
// results for small
z2 = mul_add(xa, y1, xa); // z2 = xa * y1 + xa;
// correct for sign
if (AC) { // acos
z1 = select(x < 0., VM_PI - z1, z1);
z2 = VM_PI_2 - sign_combine(z2, x);
z = select(big, z1, z2);
}
else { // asin
z1 = VM_PI_2 - z1;
z = select(big, z1, z2);
z = sign_combine(z, x);
}
return z;
}
// instantiations of asin_d template:
static inline Vec2d asin(Vec2d const & x) {
return asin_d<Vec2d, Vec2db, 0>(x);
}
static inline Vec2d acos(Vec2d const & x) {
return asin_d<Vec2d, Vec2db, 1>(x);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec4d asin(Vec4d const & x) {
return asin_d<Vec4d, Vec4db, 0>(x);
}
static inline Vec4d acos(Vec4d const & x) {
return asin_d<Vec4d, Vec4db, 1>(x);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec8d asin(Vec8d const & x) {
return asin_d<Vec8d, Vec8db, 0>(x);
}
static inline Vec8d acos(Vec8d const & x) {
return asin_d<Vec8d, Vec8db, 1>(x);
}
#endif // MAX_VECTOR_SIZE >= 512
// *************************************************************
// asin/acos template, single precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// BTYPE: boolean vector type
// AC: 0 = asin, 1 = acos
// Paramterers:
// x = input x
template<class VTYPE, class BTYPE, int AC>
static inline VTYPE asin_f(VTYPE const & x) {
// define constants
const float P4asinf = 4.2163199048E-2f;
const float P3asinf = 2.4181311049E-2f;
const float P2asinf = 4.5470025998E-2f;
const float P1asinf = 7.4953002686E-2f;
const float P0asinf = 1.6666752422E-1f;
VTYPE xa, x1, x2, x3, x4, xb, z, z1, z2;
BTYPE big;
xa = abs(x);
big = xa > 0.5f;
x1 = 0.5f * (1.0f - xa);
x2 = xa * xa;
x3 = select(big, x1, x2);
//if (horizontal_or(big))
{
xb = sqrt(x1);
}
x4 = select(big, xb, xa);
z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
z = mul_add(z, x3*x4, x4); // z = z * (x3*x4) + x4;
z1 = z + z;
// correct for sign
if (AC) { // acos
z1 = select(x < 0., float(VM_PI) - z1, z1);
z2 = float(VM_PI_2) - sign_combine(z, x);
z = select(big, z1, z2);
}
else { // asin
z1 = float(VM_PI_2) - z1;
z = select(big, z1, z);
z = sign_combine(z, x);
}
return z;
}
// instantiations of asin_f template:
static inline Vec4f asin(Vec4f const & x) {
return asin_f<Vec4f, Vec4fb, 0>(x);
}
static inline Vec4f acos(Vec4f const & x) {
return asin_f<Vec4f, Vec4fb, 1>(x);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec8f asin(Vec8f const & x) {
return asin_f<Vec8f, Vec8fb, 0>(x);
}
static inline Vec8f acos(Vec8f const & x) {
return asin_f<Vec8f, Vec8fb, 1>(x);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec16f asin(Vec16f const & x) {
return asin_f<Vec16f, Vec16fb, 0>(x);
}
static inline Vec16f acos(Vec16f const & x) {
return asin_f<Vec16f, Vec16fb, 1>(x);
}
#endif // MAX_VECTOR_SIZE >= 512
// *************************************************************
// atan template, double precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// BTYPE: boolean vector type
// T2: 0 = atan, 1 = atan2
// Paramterers:
// y, x. calculate tan(y/x)
// result is between -pi/2 and +pi/2 when x > 0
// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
// atan2(0,0) gives NAN. Future versions may give 0
template<class VTYPE, class BTYPE, int T2>
static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
// define constants
//const double ONEOPIO4 = 4./VM_PI;
const double MOREBITS = 6.123233995736765886130E-17;
const double MOREBITSO2 = MOREBITS * 0.5;
const double T3PO8 = VM_SQRT2 + 1.; // 2.41421356237309504880;
const double P4atan = -8.750608600031904122785E-1;
const double P3atan = -1.615753718733365076637E1;
const double P2atan = -7.500855792314704667340E1;
const double P1atan = -1.228866684490136173410E2;
const double P0atan = -6.485021904942025371773E1;
const double Q4atan = 2.485846490142306297962E1;
const double Q3atan = 1.650270098316988542046E2;
const double Q2atan = 4.328810604912902668951E2;
const double Q1atan = 4.853903996359136964868E2;
const double Q0atan = 1.945506571482613964425E2;
VTYPE t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re; // data vectors
BTYPE swapxy, notbig, notsmal; // boolean vectors
if (T2) { // atan2(y,x)
// move in first octant
x1 = abs(x);
y1 = abs(y);
swapxy = (y1 > x1);
// swap x and y if y1 > x1
x2 = select(swapxy, y1, x1);
y2 = select(swapxy, x1, y1);
t = y2 / x2; // x = y = 0 gives NAN here
}
else { // atan(y)
t = abs(y);
}
// small: t < 0.66
// medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
// big: t > 2.4142
notbig = t <= T3PO8; // t <= 2.4142
notsmal = t >= 0.66; // t >= 0.66
s = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2));
s = notsmal & s; // select(notsmal, s, 0.);
fac = select(notbig, VTYPE(MOREBITSO2), VTYPE(MOREBITS));
fac = notsmal & fac; //select(notsmal, fac, 0.);
// small: z = t / 1.0;
// medium: z = (t-1.0) / (t+1.0);
// big: z = -1.0 / t;
a = notbig & t; // select(notbig, t, 0.);
a = if_add(notsmal, a, -1.);
b = notbig & VTYPE(1.); // select(notbig, 1., 0.);
b = if_add(notsmal, b, t);
z = a / b; // division by 0 will not occur unless x and y are both 0
zz = z * z;
px = polynomial_4 (zz, P0atan, P1atan, P2atan, P3atan, P4atan);
qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
re = mul_add(px / qx, z * zz, z); // re = (px / qx) * (z * zz) + z;
re += s + fac;
if (T2) { // atan2(y,x)
// move back in place
re = select(swapxy, VM_PI_2 - re, re);
re = select(x < 0., VM_PI - re, re);
re = select((x | y) == 0., 0., re); // atan2(0,0) = 0 by convention
}
// get sign bit
re = sign_combine(re, y);
return re;
}
// instantiations of atan_d template:
static inline Vec2d atan2(Vec2d const & y, Vec2d const & x) {
return atan_d<Vec2d, Vec2db, 1>(y, x);
}
static inline Vec2d atan(Vec2d const & y) {
return atan_d<Vec2d, Vec2db, 0>(y, 0.);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec4d atan2(Vec4d const & y, Vec4d const & x) {
return atan_d<Vec4d, Vec4db, 1>(y, x);
}
static inline Vec4d atan(Vec4d const & y) {
return atan_d<Vec4d, Vec4db, 0>(y, 0.);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec8d atan2(Vec8d const & y, Vec8d const & x) {
return atan_d<Vec8d, Vec8db, 1>(y, x);
}
static inline Vec8d atan(Vec8d const & y) {
return atan_d<Vec8d, Vec8db, 0>(y, 0.);
}
#endif // MAX_VECTOR_SIZE >= 512
// *************************************************************
// atan template, single precision
// *************************************************************
// Template parameters:
// VTYPE: f.p. vector type
// BTYPE: boolean vector type
// T2: 0 = atan, 1 = atan2
// Paramterers:
// y, x. calculate tan(y/x)
// result is between -pi/2 and +pi/2 when x > 0
// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
// atan2(0,0) gives NAN. Future versions may give 0
template<class VTYPE, class BTYPE, int T2>
static inline VTYPE atan_f(VTYPE const & y, VTYPE const & x) {
// define constants
const float P3atanf = 8.05374449538E-2f;
const float P2atanf = -1.38776856032E-1f;
const float P1atanf = 1.99777106478E-1f;
const float P0atanf = -3.33329491539E-1f;
VTYPE t, x1, x2, y1, y2, s, a, b, z, zz, re; // data vectors
BTYPE swapxy, notbig, notsmal; // boolean vectors
if (T2) { // atan2(y,x)
// move in first octant
x1 = abs(x);
y1 = abs(y);
swapxy = (y1 > x1);
// swap x and y if y1 > x1
x2 = select(swapxy, y1, x1);
y2 = select(swapxy, x1, y1);
// do we need to protect against x = y = 0? It will just produce NAN, probably without delay
t = y2 / x2;
}
else { // atan(y)
t = abs(y);
}
// small: t < 0.4142
// medium: 0.4142 <= t <= 2.4142
// big: t > 2.4142 (not for atan2)
if (!T2) { // atan(y)
notsmal = t >= float(VM_SQRT2-1.); // t >= tan pi/8
notbig = t <= float(VM_SQRT2+1.); // t <= tan 3pi/8
s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2)));
s = notsmal & s; // select(notsmal, s, 0.);
// small: z = t / 1.0;
// medium: z = (t-1.0) / (t+1.0);
// big: z = -1.0 / t;
a = notbig & t; // select(notbig, t, 0.);
a = if_add(notsmal, a, -1.f);
b = notbig & VTYPE(1.f); // select(notbig, 1., 0.);
b = if_add(notsmal, b, t);
z = a / b; // division by 0 will not occur unless x and y are both 0
}
else { // atan2(y,x)
// small: z = t / 1.0;
// medium: z = (t-1.0) / (t+1.0);
notsmal = t >= float(VM_SQRT2-1.);
a = if_add(notsmal, t, -1.f);
b = if_add(notsmal, 1.f, t);
s = notsmal & VTYPE(float(VM_PI_4));
z = a / b;
}
zz = z * z;
// Taylor expansion
re = polynomial_3(zz, P0atanf, P1atanf, P2atanf, P3atanf);
re = mul_add(re, zz * z, z) + s;
if (T2) { // atan2(y,x)
// move back in place
re = select(swapxy, float(VM_PI_2) - re, re);
re = select(x < 0., float(VM_PI) - re, re);
re = select((x | y) == 0.f, 0.f, re); // atan2(0,0) = 0 by convention
}
// get sign bit
re = sign_combine(re, y);
return re;
}
// instantiations of atan_f template:
static inline Vec4f atan2(Vec4f const & y, Vec4f const & x) {
return atan_f<Vec4f, Vec4fb, 1>(y, x);
}
static inline Vec4f atan(Vec4f const & y) {
return atan_f<Vec4f, Vec4fb, 0>(y, 0.);
}
#if MAX_VECTOR_SIZE >= 256
static inline Vec8f atan2(Vec8f const & y, Vec8f const & x) {
return atan_f<Vec8f, Vec8fb, 1>(y, x);
}
static inline Vec8f atan(Vec8f const & y) {
return atan_f<Vec8f, Vec8fb, 0>(y, 0.);
}
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
static inline Vec16f atan2(Vec16f const & y, Vec16f const & x) {
return atan_f<Vec16f, Vec16fb, 1>(y, x);
}
static inline Vec16f atan(Vec16f const & y) {
return atan_f<Vec16f, Vec16fb, 0>(y, 0.);
}
#endif // MAX_VECTOR_SIZE >= 512
#endif