ClickHouse/contrib/libvectorclass/vectormath_exp.h

/****************************  vectormath_exp.h   ******************************
* Author:        Agner Fog
* Date created:  2014-04-18
* Last modified: 2014-10-22
* Version:       1.16
* Project:       vector classes
* Description:
* Header file containing inline vector functions of logarithms, exponential
* and power functions:
* exp         exponential function
* exmp1       exponential function minus 1
* log         natural logarithm
* log1p       natural logarithm of 1+x
* cbrt        cube root
* pow         raise vector elements to power
* pow_ratio   raise vector elements to rational power
*
* Theory, methods and inspiration based partially on these sources:
* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
*   Ellis Horwood, 1989.
* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
*   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
* > Cephes math library by Stephen L. Moshier 1992,
*   http://www.netlib.org/cephes/
*
* For detailed instructions, see vectormath_common.h and VectorClass.pdf
*
* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
******************************************************************************/

#ifndef VECTORMATH_EXP_H
#define VECTORMATH_EXP_H  1

#include "vectormath_common.h"


/******************************************************************************
*                 Exponential functions
******************************************************************************/

// Helper functions, used internally:

// This function calculates pow(2,n) where n must be an integer. Does not check for overflow or underflow
static inline Vec2d vm_pow2n (Vec2d const & n) {
    const double pow2_52 = 4503599627370496.0;   // 2^52
    const double bias = 1023.0;                  // bias in exponent
    Vec2d a = n + (bias + pow2_52);              // put n + bias in least significant bits
    Vec2q b = reinterpret_i(a);                  // bit-cast to integer
    Vec2q c = b << 52;                           // shift left 52 places to get into exponent field
    Vec2d d = reinterpret_d(c);                  // bit-cast back to double
    return d;
}

static inline Vec4f vm_pow2n (Vec4f const & n) {
    const float pow2_23 =  8388608.0;            // 2^23
    const float bias = 127.0;                    // bias in exponent
    Vec4f a = n + (bias + pow2_23);              // put n + bias in least significant bits
    Vec4i b = reinterpret_i(a);                  // bit-cast to integer
    Vec4i c = b << 23;                           // shift left 23 places to get into exponent field
    Vec4f d = reinterpret_f(c);                  // bit-cast back to float
    return d;
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec4d vm_pow2n (Vec4d const & n) {
    const double pow2_52 = 4503599627370496.0;   // 2^52
    const double bias = 1023.0;                  // bias in exponent
    Vec4d a = n + (bias + pow2_52);              // put n + bias in least significant bits
    Vec4q b = reinterpret_i(a);                  // bit-cast to integer
    Vec4q c = b << 52;                           // shift left 52 places to get value into exponent field
    Vec4d d = reinterpret_d(c);                  // bit-cast back to double
    return d;
}

static inline Vec8f vm_pow2n (Vec8f const & n) {
    const float pow2_23 =  8388608.0;            // 2^23
    const float bias = 127.0;                    // bias in exponent
    Vec8f a = n + (bias + pow2_23);              // put n + bias in least significant bits
    Vec8i b = reinterpret_i(a);                  // bit-cast to integer
    Vec8i c = b << 23;                           // shift left 23 places to get into exponent field
    Vec8f d = reinterpret_f(c);                  // bit-cast back to float
    return d;
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec8d vm_pow2n (Vec8d const & n) {
    const double pow2_52 = 4503599627370496.0;   // 2^52
    const double bias = 1023.0;                  // bias in exponent
    Vec8d a = n + (bias + pow2_52);              // put n + bias in least significant bits
    Vec8q b = Vec8q(reinterpret_i(a));           // bit-cast to integer
    Vec8q c = b << 52;                           // shift left 52 places to get value into exponent field
    Vec8d d = Vec8d(reinterpret_d(c));           // bit-cast back to double
    return d;
}

static inline Vec16f vm_pow2n (Vec16f const & n) {
    const float pow2_23 =  8388608.0;            // 2^23
    const float bias = 127.0;                    // bias in exponent
    Vec16f a = n + (bias + pow2_23);             // put n + bias in least significant bits
    Vec16i b = Vec16i(reinterpret_i(a));         // bit-cast to integer
    Vec16i c = b << 23;                          // shift left 23 places to get into exponent field
    Vec16f d = Vec16f(reinterpret_f(c));         // bit-cast back to float
    return d;
}

#endif // MAX_VECTOR_SIZE >= 512


// Template for exp function, double precision
// The limit of abs(x) is defined by max_x below
// This function does not produce denormals
// Template parameters:
// VTYPE: double vector type
// BTYPE: boolean vector type
// M1: 0 for exp, 1 for expm1
// BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)

#if 1  // choose method

// Taylor expansion
template<class VTYPE, class BTYPE, int M1, int BA>
static inline VTYPE exp_d(VTYPE const & initial_x) {

    // Taylor coefficients, 1/n!
    // Not using minimax approximation because we prioritize precision close to x = 0
    const double p2  = 1./2.;
    const double p3  = 1./6.;
    const double p4  = 1./24.;
    const double p5  = 1./120.;
    const double p6  = 1./720.;
    const double p7  = 1./5040.;
    const double p8  = 1./40320.;
    const double p9  = 1./362880.;
    const double p10 = 1./3628800.;
    const double p11 = 1./39916800.;
    const double p12 = 1./479001600.;
    const double p13 = 1./6227020800.;

    // maximum abs(x), value depends on BA, defined below
    // The lower limit of x is slightly more restrictive than the upper limit.
    // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
    double max_x;

    // data vectors
    VTYPE x, r, z, n2;
    BTYPE inrange;                               // boolean vector

    if (BA <= 1) { // exp(x)
        max_x = BA == 0 ? 708.39 : 709.7; // lower limit for 0.5*exp(x) is -707.6, but we are using 0.5*exp(x) only for positive x in hyperbolic functions
        const double ln2d_hi = 0.693145751953125;
        const double ln2d_lo = 1.42860682030941723212E-6;
        x  = initial_x;
        r  = round(initial_x*VM_LOG2E);
        // subtraction in two steps for higher precision
        x = nmul_add(r, ln2d_hi, x);             //  x -= r * ln2d_hi;
        x = nmul_add(r, ln2d_lo, x);             //  x -= r * ln2d_lo;
    }
    else if (BA == 2) { // pow(2,x)
        max_x = 1022.0;
        r  = round(initial_x);
        x  = initial_x - r;
        x *= VM_LN2;
    }
    else if (BA == 10) { // pow(10,x)
        max_x = 307.65;
        const double log10_2_hi = 0.30102999554947019;     // log10(2) in two parts
        const double log10_2_lo = 1.1451100899212592E-10;
        x  = initial_x;
        r  = round(initial_x*(VM_LOG2E*VM_LN10));
        // subtraction in two steps for higher precision
        x  = nmul_add(r, log10_2_hi, x);         //  x -= r * log10_2_hi;
        x  = nmul_add(r, log10_2_lo, x);         //  x -= r * log10_2_lo;
        x *= VM_LN10;
    }
    else  {  // undefined value of BA
        return 0.;
    }

    z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13);

    if (BA == 1) r--;  // 0.5 * exp(x)

    // multiply by power of 2
    n2 = vm_pow2n(r);

    if (M1 == 0) {
        // exp
        z = (z + 1.0) * n2;
    }
    else {
        // expm1
        z = mul_add(z, n2, n2 - 1.0);            // z = z * n2 + (n2 - 1.0);
    }

    // check for overflow
    inrange  = abs(initial_x) < max_x;
    // check for INF and NAN
    inrange &= is_finite(initial_x);

    if (horizontal_and(inrange)) {
        // fast normal path
        return z;
    }
    else {
        // overflow, underflow and NAN
        r = select(sign_bit(initial_x), 0.-M1, infinite_vec<VTYPE>()); // value in case of +/- overflow or INF
        z = select(inrange, z, r);                                     // +/- underflow
        z = select(is_nan(initial_x), initial_x, z);                   // NAN goes through
        return z;
    }
}

#else

// Pade expansion uses less code and fewer registers, but is slower
template<class VTYPE, class BTYPE, int M1, int BA>
static inline VTYPE exp_d(VTYPE const & initial_x) {

    // define constants
    const double ln2p1   = 0.693145751953125;
    const double ln2p2   = 1.42860682030941723212E-6;
    const double log2e   = VM_LOG2E;
    const double max_exp = 708.39;
    // coefficients of pade polynomials
    const double P0exp = 9.99999999999999999910E-1;
    const double P1exp = 3.02994407707441961300E-2;
    const double P2exp = 1.26177193074810590878E-4;
    const double Q0exp = 2.00000000000000000009E0;
    const double Q1exp = 2.27265548208155028766E-1;
    const double Q2exp = 2.52448340349684104192E-3;
    const double Q3exp = 3.00198505138664455042E-6;

    VTYPE x, r, xx, px, qx, y, n2;               // data vectors
    BTYPE inrange;                               // boolean vector

    x = initial_x;
    r = round(initial_x*log2e);

    // subtraction in one step would gives loss of precision
    x -= r * ln2p1;
    x -= r * ln2p2;

    xx = x * x;

    // px = x * P(x^2).
    px = polynomial_2(xx, P0exp, P1exp, P2exp) * x;

    // Evaluate Q(x^2).
    qx = polynomial_3(xx, Q0exp, Q1exp, Q2exp, Q3exp);

    // e^x = 1 + 2*P(x^2)/( Q(x^2) - P(x^2) )
    y = (2.0 * px) / (qx - px);

    // Get 2^n in double.
    // n  = round_to_int64_limited(r);
    // n2 = exp2(n);
    n2 = vm_pow2n(r);  // this is faster

    if (M1 == 0) {
        // exp
        y = (y + 1.0) * n2;
    }
    else {
        // expm1
        y = y * n2 + (n2 - 1.0);
    }

    // overflow
    inrange  = abs(initial_x) < max_exp;
    // check for INF and NAN
    inrange &= is_finite(initial_x);

    if (horizontal_and(inrange)) {
        // fast normal path
        return y;
    }
    else {
        // overflow, underflow and NAN
        r = select(sign_bit(initial_x), 0.-M1, infinite_vec<VTYPE>()); // value in case of overflow or INF
        y = select(inrange, y, r);                                     // +/- overflow
        y = select(is_nan(initial_x), initial_x, y);                   // NAN goes through
        return y;
    }
}
#endif

// instances of exp_d template
static inline Vec2d exp(Vec2d const & x) {
    return exp_d<Vec2d, Vec2db, 0, 0>(x);
}

static inline Vec2d expm1(Vec2d const & x) {
    return exp_d<Vec2d, Vec2db, 1, 0>(x);
}

static inline Vec2d exp2(Vec2d const & x) {
    return exp_d<Vec2d, Vec2db, 0, 2>(x);
}

static inline Vec2d exp10(Vec2d const & x) {
    return exp_d<Vec2d, Vec2db, 0, 10>(x);
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec4d exp(Vec4d const & x) {
    return exp_d<Vec4d, Vec4db, 0, 0>(x);
}

static inline Vec4d expm1(Vec4d const & x) {
    return exp_d<Vec4d, Vec4db, 1, 0>(x);
}

static inline Vec4d exp2(Vec4d const & x) {
    return exp_d<Vec4d, Vec4db, 0, 2>(x);
}

static inline Vec4d exp10(Vec4d const & x) {
    return exp_d<Vec4d, Vec4db, 0, 10>(x);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec8d exp(Vec8d const & x) {
    return exp_d<Vec8d, Vec8db, 0, 0>(x);
}

static inline Vec8d expm1(Vec8d const & x) {
    return exp_d<Vec8d, Vec8db, 1, 0>(x);
}

static inline Vec8d exp2(Vec8d const & x) {
    return exp_d<Vec8d, Vec8db, 0, 2>(x);
}

static inline Vec8d exp10(Vec8d const & x) {
    return exp_d<Vec8d, Vec8db, 0, 10>(x);
}

#endif // MAX_VECTOR_SIZE >= 512

// Template for exp function, single precision
// The limit of abs(x) is defined by max_x below
// This function does not produce denormals
// Template parameters:
// VTYPE: float vector type
// BTYPE: boolean vector type
// M1: 0 for exp, 1 for expm1
// BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)

template<class VTYPE, class BTYPE, int M1, int BA>
static inline VTYPE exp_f(VTYPE const & initial_x) {

    // Taylor coefficients
    const float P0expf   =  1.f/2.f;
    const float P1expf   =  1.f/6.f;
    const float P2expf   =  1.f/24.f;
    const float P3expf   =  1.f/120.f;
    const float P4expf   =  1.f/720.f;
    const float P5expf   =  1.f/5040.f;

    VTYPE x, r, x2, z, n2;                       // data vectors
    BTYPE inrange;                               // boolean vector

    // maximum abs(x), value depends on BA, defined below
    // The lower limit of x is slightly more restrictive than the upper limit.
    // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
    float max_x;

    if (BA <= 1) { // exp(x)
        const float ln2f_hi  =  0.693359375f;
        const float ln2f_lo  = -2.12194440e-4f;
        max_x = (BA == 0) ? 87.3f : 89.0f;

        x = initial_x;
        r = round(initial_x*float(VM_LOG2E));
        x = nmul_add(r, VTYPE(ln2f_hi), x);      //  x -= r * ln2f_hi;
        x = nmul_add(r, VTYPE(ln2f_lo), x);      //  x -= r * ln2f_lo;
    }
    else if (BA == 2) {                          // pow(2,x)
        max_x = 126.f;
        r = round(initial_x);
        x = initial_x - r;
        x = x * (float)VM_LN2;
    }
    else if (BA == 10) {                         // pow(10,x)
        max_x = 37.9f;
        const float log10_2_hi = 0.301025391f;   // log10(2) in two parts
        const float log10_2_lo = 4.60503907E-6f;
        x = initial_x;
        r = round(initial_x*float(VM_LOG2E*VM_LN10));
        x = nmul_add(r, VTYPE(log10_2_hi), x);   //  x -= r * log10_2_hi;
        x = nmul_add(r, VTYPE(log10_2_lo), x);   //  x -= r * log10_2_lo;
        x = x * (float)VM_LN10;
    }
    else  {  // undefined value of BA
        return 0.;
    }

    x2 = x * x;
    z = polynomial_5(x,P0expf,P1expf,P2expf,P3expf,P4expf,P5expf);
    z = mul_add(z, x2, x);                       // z *= x2;  z += x;

    if (BA == 1) r--;                            // 0.5 * exp(x)

    // multiply by power of 2
    n2 = vm_pow2n(r);

    if (M1 == 0) {
        // exp
        z = (z + 1.0f) * n2;
    }
    else {
        // expm1
        z = mul_add(z, n2, n2 - 1.0f);           //  z = z * n2 + (n2 - 1.0f);
    }

    // check for overflow
    inrange  = abs(initial_x) < max_x;
    // check for INF and NAN
    inrange &= is_finite(initial_x);

    if (horizontal_and(inrange)) {
        // fast normal path
        return z;
    }
    else {
        // overflow, underflow and NAN
        r = select(sign_bit(initial_x), 0.f-M1, infinite_vec<VTYPE>()); // value in case of +/- overflow or INF
        z = select(inrange, z, r);                                      // +/- underflow
        z = select(is_nan(initial_x), initial_x, z);                    // NAN goes through
        return z;
    }
}

// instances of exp_f template
static inline Vec4f exp(Vec4f const & x) {
    return exp_f<Vec4f, Vec4fb, 0, 0>(x);
}

static inline Vec4f expm1(Vec4f const & x) {
    return exp_f<Vec4f, Vec4fb, 1, 0>(x);
}

static inline Vec4f exp2(Vec4f const & x) {
    return exp_f<Vec4f, Vec4fb, 0, 2>(x);
}

static inline Vec4f exp10(Vec4f const & x) {
    return exp_f<Vec4f, Vec4fb, 0, 10>(x);
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec8f exp(Vec8f const & x) {
    return exp_f<Vec8f, Vec8fb, 0, 0>(x);
}

static inline Vec8f expm1(Vec8f const & x) {
    return exp_f<Vec8f, Vec8fb, 1, 0>(x);
}

static inline Vec8f exp2(Vec8f const & x) {
    return exp_f<Vec8f, Vec8fb, 0, 2>(x);
}

static inline Vec8f exp10(Vec8f const & x) {
    return exp_f<Vec8f, Vec8fb, 0, 10>(x);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec16f exp(Vec16f const & x) {
    return exp_f<Vec16f, Vec16fb, 0, 0>(x);
}

static inline Vec16f expm1(Vec16f const & x) {
    return exp_f<Vec16f, Vec16fb, 1, 0>(x);
}

static inline Vec16f exp2(Vec16f const & x) {
    return exp_f<Vec16f, Vec16fb, 0, 2>(x);
}

static inline Vec16f exp10(Vec16f const & x) {
    return exp_f<Vec16f, Vec16fb, 0, 10>(x);
}

#endif // MAX_VECTOR_SIZE >= 512


/******************************************************************************
*                 Logarithm functions
******************************************************************************/

// Helper functions: fraction_2(x) = fraction(x)*0.5

// Modified fraction function:
// Extract the fraction part of a floating point number, and divide by 2
// The fraction function is defined in vectorf128.h etc.
// fraction_2(x) = fraction(x)*0.5
// This version gives half the fraction without extra delay
// Does not work for x = 0
static inline Vec4f fraction_2(Vec4f const & a) {
    Vec4ui t1 = _mm_castps_si128(a);   // reinterpret as 32-bit integer
    Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F000000); // set exponent to 0 + bias
    return _mm_castsi128_ps(t2);
}

static inline Vec2d fraction_2(Vec2d const & a) {
    Vec2uq t1 = _mm_castpd_si128(a);   // reinterpret as 64-bit integer
    Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // set exponent to 0 + bias
    return _mm_castsi128_pd(t2);
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec8f fraction_2(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 2  // 256 bit integer vectors are available, AVX2
    Vec8ui t1 = _mm256_castps_si256(a);   // reinterpret as 32-bit integer
    Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F000000; // set exponent to 0 + bias
    return _mm256_castsi256_ps(t2);
#else
    return Vec8f(fraction_2(a.get_low()), fraction_2(a.get_high()));
#endif
}

static inline Vec4d fraction_2(Vec4d const & a) {
#if VECTORI256_H > 1  // AVX2
    Vec4uq t1 = _mm256_castpd_si256(a);   // reinterpret as 64-bit integer
    Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // set exponent to 0 + bias
    return _mm256_castsi256_pd(t2);
#else
    return Vec4d(fraction_2(a.get_low()), fraction_2(a.get_high()));
#endif
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec16f fraction_2(Vec16f const & a) {
#if INSTRSET >= 9                    // 512 bit integer vectors are available, AVX512
    return _mm512_getmant_ps(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
    //return Vec16f(_mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5f;
#else
    return Vec16f(fraction_2(a.get_low()), fraction_2(a.get_high()));
#endif
}

static inline Vec8d fraction_2(Vec8d const & a) {
#if INSTRSET >= 9                    // 512 bit integer vectors are available, AVX512
    return _mm512_getmant_pd(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
    //return Vec8d(_mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5;
#else
    return Vec8d(fraction_2(a.get_low()), fraction_2(a.get_high()));
#endif
}

#endif // MAX_VECTOR_SIZE >= 512


// Helper functions: exponent_f(x) = exponent(x) as floating point number

union vm_ufi {
    float f;
    uint32_t i;
};

union vm_udi {
    double d;
    uint64_t i;
};

// extract exponent of a positive number x as a floating point number
static inline Vec4f exponent_f(Vec4f const & x) {
    const float pow2_23 =  8388608.0f;           // 2^23
    const float bias = 127.f;                    // bias in exponent
    const vm_ufi upow2_23 = {pow2_23};
    Vec4ui a = reinterpret_i(x);                 // bit-cast x to integer
    Vec4ui b = a >> 23;                          // shift down exponent to low bits
    Vec4ui c = b | Vec4ui(upow2_23.i);           // insert new exponent
    Vec4f  d = reinterpret_f(c);                 // bit-cast back to double
    Vec4f  e = d - (pow2_23 + bias);             // subtract magic number and bias
    return e;
}

static inline Vec2d exponent_f(Vec2d const & x) {
    const double pow2_52 = 4503599627370496.0;   // 2^52
    const double bias = 1023.0;                  // bias in exponent
    const vm_udi upow2_52 = {pow2_52};

    Vec2uq a = reinterpret_i(x);                 // bit-cast x to integer
    Vec2uq b = a >> 52;                          // shift down exponent to low bits
    Vec2uq c = b | Vec2uq(upow2_52.i);           // insert new exponent
    Vec2d  d = reinterpret_d(c);                 // bit-cast back to double
    Vec2d  e = d - (pow2_52 + bias);             // subtract magic number and bias
    return e;
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec8f exponent_f(Vec8f const & x) {
    const float pow2_23 =  8388608.0f;           // 2^23
    const float bias = 127.f;                    // bias in exponent
    const vm_ufi upow2_23 = {pow2_23};
    Vec8ui a = reinterpret_i(x);                 // bit-cast x to integer
    Vec8ui b = a >> 23;                          // shift down exponent to low bits
    Vec8ui c = b | Vec8ui(upow2_23.i);           // insert new exponent
    Vec8f  d = reinterpret_f(c);                 // bit-cast back to double
    Vec8f  e = d - (pow2_23 + bias);             // subtract magic number and bias
    return e;
}

// extract exponent of a positive number x as a floating point number
static inline Vec4d exponent_f(Vec4d const & x) {
    const double pow2_52 = 4503599627370496.0;   // 2^52
    const double bias = 1023.0;                  // bias in exponent
    const vm_udi upow2_52 = {pow2_52};

    Vec4uq a = reinterpret_i(x);                 // bit-cast x to integer
    Vec4uq b = a >> 52;                          // shift down exponent to low bits
    Vec4uq c = b | Vec4uq(upow2_52.i);           // insert new exponent
    Vec4d  d = reinterpret_d(c);                 // bit-cast back to double
    Vec4d  e = d - (pow2_52 + bias);             // subtract magic number and bias
    return e;
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec16f exponent_f(Vec16f const & x) {
#if INSTRSET >= 9                                // AVX512
    return _mm512_getexp_ps(x);
#else
    return Vec16f(exponent_f(x.get_low()), exponent_f(x.get_high()));
#endif
}

// extract exponent of a positive number x as a floating point number
static inline Vec8d exponent_f(Vec8d const & x) {
#if INSTRSET >= 9                                // AVX512
    return _mm512_getexp_pd(x);
#else
    return Vec8d(exponent_f(x.get_low()), exponent_f(x.get_high()));
#endif
}

#endif // MAX_VECTOR_SIZE >= 512


// log function, double precision
// template parameters:
// VTYPE: f.p. vector type
// BTYPE: boolean vector type
// M1: 0 for log, 1 for log1p
template<class VTYPE, class BTYPE, int M1>
static inline VTYPE log_d(VTYPE const & initial_x) {

    // define constants
    const double ln2_hi =  0.693359375;
    const double ln2_lo = -2.121944400546905827679E-4;
    const double P0log  =  7.70838733755885391666E0;
    const double P1log  =  1.79368678507819816313E1;
    const double P2log  =  1.44989225341610930846E1;
    const double P3log  =  4.70579119878881725854E0;
    const double P4log  =  4.97494994976747001425E-1;
    const double P5log  =  1.01875663804580931796E-4;
    const double Q0log  =  2.31251620126765340583E1;
    const double Q1log  =  7.11544750618563894466E1;
    const double Q2log  =  8.29875266912776603211E1;
    const double Q3log  =  4.52279145837532221105E1;
    const double Q4log  =  1.12873587189167450590E1;

    VTYPE x1, x, x2, px, qx, res, fe;            // data vectors
    BTYPE blend, overflow, underflow;            // boolean vectors

    if (M1 == 0) {
        x1 = initial_x;                          // log(x)
    }
    else {
        x1 = initial_x + 1.0;                    // log(x+1)
    }
    // separate mantissa from exponent
    // VTYPE x  = fraction(x1) * 0.5;
    x  = fraction_2(x1);
    fe = exponent_f(x1);

    blend = x > VM_SQRT2*0.5;
    x  = if_add(!blend, x, x);                   // conditional add
    fe = if_add(blend, fe, 1.);                  // conditional add

    if (M1 == 0) {
        // log(x). Expand around 1.0
        x -= 1.0;
    }
    else {
        // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
        x = select(fe==0., initial_x, x - 1.0);
    }

    // rational form
    px  = polynomial_5 (x, P0log, P1log, P2log, P3log, P4log, P5log);
    x2  = x * x;
    px *= x * x2;
    qx  = polynomial_5n(x, Q0log, Q1log, Q2log, Q3log, Q4log);
    res = px / qx ;

    // add exponent
    res  = mul_add(fe, ln2_lo, res);             // res += fe * ln2_lo;
    res += nmul_add(x2, 0.5, x);                 // res += x  - 0.5 * x2;
    res  = mul_add(fe, ln2_hi, res);             // res += fe * ln2_hi;

    overflow  = !is_finite(x1);
    underflow = x1 < VM_SMALLEST_NORMAL;         // denormals not supported by this functions

    if (!horizontal_or(overflow | underflow)) {
        // normal path
        return res;
    }
    else {
        // overflow and underflow
        res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                   // x1  < 0 gives NAN
        res = select(x1 == 0. || is_subnormal(x1), -infinite_vec<VTYPE>(), res); // x1 == 0 gives -INF
        res = select(overflow,  x1, res);                                        // INF or NAN goes through
        res = select(is_inf(x1)&sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);     // -INF gives NAN
        return res;
    }
}


static inline Vec2d log(Vec2d const & x) {
    return log_d<Vec2d, Vec2db, 0>(x);
}

static inline Vec2d log1p(Vec2d const & x) {
    return log_d<Vec2d, Vec2db, 1>(x);
}

static inline Vec2d log2(Vec2d const & x) {
    return VM_LOG2E * log_d<Vec2d, Vec2db, 0>(x);
}

static inline Vec2d log10(Vec2d const & x) {
    return VM_LOG10E * log_d<Vec2d, Vec2db, 0>(x);
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec4d log(Vec4d const & x) {
    return log_d<Vec4d, Vec4db, 0>(x);
}

static inline Vec4d log1p(Vec4d const & x) {
    return log_d<Vec4d, Vec4db, 1>(x);
}

static inline Vec4d log2(Vec4d const & x) {
    return VM_LOG2E * log_d<Vec4d, Vec4db, 0>(x);
}

static inline Vec4d log10(Vec4d const & x) {
    return VM_LOG10E * log_d<Vec4d, Vec4db, 0>(x);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec8d log(Vec8d const & x) {
    return log_d<Vec8d, Vec8db, 0>(x);
}

static inline Vec8d log1p(Vec8d const & x) {
    return log_d<Vec8d, Vec8db, 1>(x);
}

static inline Vec8d log2(Vec8d const & x) {
    return VM_LOG2E * log_d<Vec8d, Vec8db, 0>(x);
}

static inline Vec8d log10(Vec8d const & x) {
    return VM_LOG10E * log_d<Vec8d, Vec8db, 0>(x);
}

#endif // MAX_VECTOR_SIZE >= 512


// log function, single precision
// template parameters:
// VTYPE: f.p. vector type
// ITYPE: integer vector type with same element size
// BTYPE: boolean vector type
// BTYPEI: boolean vector type for ITYPE
// M1: 0 for log, 1 for log1p
template<class VTYPE, class ITYPE, class BTYPE, class BTYPEI, int M1>
static inline VTYPE log_f(VTYPE const & initial_x) {

    // define constants
    const float ln2f_hi =  0.693359375f;
    const float ln2f_lo = -2.12194440E-4f;
    const float P0logf  =  3.3333331174E-1f;
    const float P1logf  = -2.4999993993E-1f;
    const float P2logf  =  2.0000714765E-1f;
    const float P3logf  = -1.6668057665E-1f;
    const float P4logf  =  1.4249322787E-1f;
    const float P5logf  = -1.2420140846E-1f;
    const float P6logf  =  1.1676998740E-1f;
    const float P7logf  = -1.1514610310E-1f;
    const float P8logf  =  7.0376836292E-2f;

    VTYPE x1, x, res, x2, fe;                    // data vectors
    ITYPE e;                                     // integer vector
    BTYPE blend, overflow, underflow;            // boolean vectors

    if (M1 == 0) {
        x1 = initial_x;                          // log(x)
    }
    else {
        x1 = initial_x + 1.0f;                   // log(x+1)
    }

    // separate mantissa from exponent
    x = fraction_2(x1);
    e = exponent(x1);

    blend = x > float(VM_SQRT2*0.5);
    x  = if_add(!blend, x, x);                   // conditional add
    e  = if_add(BTYPEI(blend),  e, ITYPE(1));    // conditional add
    fe = to_float(e);

    if (M1 == 0) {
        // log(x). Expand around 1.0
        x -= 1.0f;
    }
    else {
        // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
        x = select(BTYPE(e==0), initial_x, x - 1.0f);
    }

    // Taylor expansion
    res = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
    x2  = x*x;
    res *= x2*x;

    // add exponent
    res  = mul_add(fe, ln2f_lo, res);            // res += ln2f_lo  * fe;
    res += nmul_add(x2, 0.5f, x);                // res += x - 0.5f * x2;
    res  = mul_add(fe, ln2f_hi, res);            // res += ln2f_hi  * fe;

    overflow  = !is_finite(x1);
    underflow = x1 < VM_SMALLEST_NORMALF;        // denormals not supported by this functions

    if (!horizontal_or(overflow | underflow)) {
        // normal path
        return res;
    }
    else {
        // overflow and underflow
        res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                    // x1 < 0 gives NAN
        res = select(x1 == 0.f || is_subnormal(x1), -infinite_vec<VTYPE>(), res); // x1 == 0 or denormal gives -INF
        res = select(overflow,  x1, res);                                         // INF or NAN goes through
        res = select(is_inf(x1)&sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);      // -INF gives NAN
        return res;
    }
}

static inline Vec4f log(Vec4f const & x) {
    return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);
}

static inline Vec4f log1p(Vec4f const & x) {
    return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 1>(x);
}

static inline Vec4f log2(Vec4f const & x) {
    return float(VM_LOG2E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);
}

static inline Vec4f log10(Vec4f const & x) {
    return float(VM_LOG10E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec8f log(Vec8f const & x) {
    return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);
}

static inline Vec8f log1p(Vec8f const & x) {
    return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 1>(x);
}

static inline Vec8f log2(Vec8f const & x) {
    return float(VM_LOG2E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);
}

static inline Vec8f log10(Vec8f const & x) {
    return float(VM_LOG10E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec16f log(Vec16f const & x) {
    return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);
}

static inline Vec16f log1p(Vec16f const & x) {
    return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 1>(x);
}

static inline Vec16f log2(Vec16f const & x) {
    return float(VM_LOG2E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);
}

static inline Vec16f log10(Vec16f const & x) {
    return float(VM_LOG10E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);
}

#endif // MAX_VECTOR_SIZE >= 512


/******************************************************************************
*           Cube root and reciprocal cube root
******************************************************************************/

// cube root template, double precision
// template parameters:
// VTYPE:  f.p. vector type
// ITYPE:  uint32_t integer vector type with same total number of bits
// ITYPE2: uint64_t integer vector type with same total number of bits
// BTYPE:  boolean vector type
// CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
template<class VTYPE, class ITYPE, class ITYPE2, class BTYPE, int CR>
static inline VTYPE cbrt_d(VTYPE const & x) {
    const int iter = 7;     // iteration count of x^(-1/3) loop
    int i;
    VTYPE  xa, xa3, a, a2;
    ITYPE  m1, m2;
    BTYPE  underflow;
    ITYPE2 q1(0x5540000000000000ULL);            // exponent bias
    ITYPE2 q2(0x0005555500000000ULL);            // exponent multiplier for 1/3
    ITYPE2 q3(0x0010000000000000ULL);            // denormal limit
    const double one_third  = 1./3.;
    const double four_third = 4./3.;

    xa  = abs(x);
    xa3 = one_third*xa;

    // multiply exponent by -1/3
    m1 = reinterpret_i(xa);
    m2 = ITYPE(q1) - (m1 >> 20) * ITYPE(q2);
    a  = reinterpret_d(m2);
    underflow = BTYPE(ITYPE2(m1) < q3);          // true if denormal or zero

    // Newton Raphson iteration
    for (i = 0; i < iter-1; i++) {
        a2 = a * a;
        a = nmul_add(xa3, a2*a2, four_third*a);  // a = four_third*a - xa3*a2*a2;
    }
    // last iteration with better precision
    a2 = a * a;
    a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); // a = a + one_third*(a - xa*a2*a2);

    if (CR == -1) {  // reciprocal cube root
        // (note: gives wrong sign when input is INF)
        // generate INF if underflow
        a = select(underflow, infinite_vec<VTYPE>(), a);
        // get sign
        a = sign_combine(a, x);
    }
    else if (CR == 1) {     // cube root
        a = a * a * x;
        // generate 0 if underflow
        a = select(underflow, 0., a);
    }
    else if (CR == 2) {     // cube root squared
        // (note: gives wrong sign when input is INF)
        a = a * xa;
        // generate 0 if underflow
        a = select(underflow, 0., a);
    }
    return a;
}

// template instances for cbrt and reciprocal_cbrt

// cube root
static inline Vec2d cbrt(Vec2d const & x) {
    return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 1> (x);
}

// reciprocal cube root
static inline Vec2d reciprocal_cbrt(Vec2d const & x) {
    return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, -1> (x);
}

// square cube root
static inline Vec2d square_cbrt(Vec2d const & x) {
    return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 2> (x);
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec4d cbrt(Vec4d const & x) {
    return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 1> (x);
}

static inline Vec4d reciprocal_cbrt(Vec4d const & x) {
    return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, -1> (x);
}

static inline Vec4d square_cbrt(Vec4d const & x) {
    return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 2> (x);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec8d cbrt(Vec8d const & x) {
    return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 1> (x);
}

static inline Vec8d reciprocal_cbrt(Vec8d const & x) {
    return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, -1> (x);
}

static inline Vec8d square_cbrt(Vec8d const & x) {
    return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 2> (x);
}

#endif // MAX_VECTOR_SIZE >= 512


// cube root template, single precision
// template parameters:
// VTYPE:  f.p. vector type
// ITYPE:  uint32_t integer vector type
// BTYPE:  boolean vector type
// CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
template<class VTYPE, class ITYPE, class BTYPE, int CR>
static inline VTYPE cbrt_f(VTYPE const & x) {

    const int iter = 6;                          // iteration count of x^(-1/3) loop
    int i;
    VTYPE  xa, xa3, a, a2;
    ITYPE  m1, m2;
    BTYPE  underflow;
    ITYPE  q1(0x54800000U);                      // exponent bias
    ITYPE  q2(0x002AAAAAU);                      // exponent multiplier for 1/3
    ITYPE  q3(0x00800000U);                      // denormal limit
    const  float one_third  = float(1./3.);
    const  float four_third = float(4./3.);

    xa  = abs(x);
    xa3 = one_third*xa;

    // multiply exponent by -1/3
    m1 = reinterpret_i(xa);
    m2 = q1 - (m1 >> 23) * q2;
    a  = reinterpret_f(m2);

    underflow = BTYPE(m1 < q3);                  // true if denormal or zero

    // Newton Raphson iteration
    for (i = 0; i < iter-1; i++) {
        a2 = a*a;
        a = nmul_add(xa3, a2*a2, four_third*a);  // a = four_third*a - xa3*a2*a2;
    }
    // last iteration with better precision
    a2 = a*a;
    a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); //a = a + one_third*(a - xa*a2*a2);

    if (CR == -1) {                              // reciprocal cube root
        // generate INF if underflow
        a = select(underflow, infinite_vec<VTYPE>(), a);
        // get sign
        a = sign_combine(a, x);
    }
    else if (CR == 1) {                          // cube root
        a = a * a * x;
        // generate 0 if underflow
        a = select(underflow, 0., a);
    }
    else if (CR == 2) {                          // cube root squared
        a = a * xa;
        // generate 0 if underflow
        a = select(underflow, 0., a);
    }
    return a;
}

// template instances for cbrt and reciprocal_cbrt

// cube root
static inline Vec4f cbrt(Vec4f const & x) {
    return cbrt_f<Vec4f, Vec4ui, Vec4fb, 1> (x);
}

// reciprocal cube root
static inline Vec4f reciprocal_cbrt(Vec4f const & x) {
    return cbrt_f<Vec4f, Vec4ui, Vec4fb, -1> (x);
}

// square cube root
static inline Vec4f square_cbrt(Vec4f const & x) {
    return cbrt_f<Vec4f, Vec4ui, Vec4fb, 2> (x);
}

#if MAX_VECTOR_SIZE >= 256

static inline Vec8f cbrt(Vec8f const & x) {
    return cbrt_f<Vec8f, Vec8ui, Vec8fb, 1> (x);
}

static inline Vec8f reciprocal_cbrt(Vec8f const & x) {
    return cbrt_f<Vec8f, Vec8ui, Vec8fb, -1> (x);
}

static inline Vec8f square_cbrt(Vec8f const & x) {
    return cbrt_f<Vec8f, Vec8ui, Vec8fb, 2> (x);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

static inline Vec16f cbrt(Vec16f const & x) {
    return cbrt_f<Vec16f, Vec16ui, Vec16fb, 1> (x);
}

static inline Vec16f reciprocal_cbrt(Vec16f const & x) {
    return cbrt_f<Vec16f, Vec16ui, Vec16fb, -1> (x);
}

static inline Vec16f square_cbrt(Vec16f const & x) {
    return cbrt_f<Vec16f, Vec16ui, Vec16fb, 2> (x);
}

#endif // MAX_VECTOR_SIZE >= 512


// ****************************************************************************
//                pow template, double precision
// ****************************************************************************
// Calculate x to the power of y.

// Precision is important here because rounding errors get multiplied by y.
// The logarithm is calculated with extra precision, and the exponent is
// calculated separately.
// The logarithm is calculated by Pad\E9 approximation with 6'th degree
// polynomials. A 7'th degree would be preferred for best precision by high y.
// The alternative method: log(x) = z + z^3*R(z)/S(z), where z = 2(x-1)/(x+1)
// did not give better precision.

// Template parameters:
// VTYPE:  data vector type
// ITYPE:  signed integer vector type
// BTYPE:  boolean vector type
template <class VTYPE, class ITYPE, class BTYPE>
static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) {

    // define constants
    const double ln2d_hi = 0.693145751953125;           // log(2) in extra precision, high bits
    const double ln2d_lo = 1.42860682030941723212E-6;   // low bits of log(2)
    const double log2e   = VM_LOG2E;                    // 1/log(2)
    const double pow2_52 = 4503599627370496.0;          // 2^52

    // coefficients for Pad\E9 polynomials
    const double P0logl =  2.0039553499201281259648E1;
    const double P1logl =  5.7112963590585538103336E1;
    const double P2logl =  6.0949667980987787057556E1;
    const double P3logl =  2.9911919328553073277375E1;
    const double P4logl =  6.5787325942061044846969E0;
    const double P5logl =  4.9854102823193375972212E-1;
    const double P6logl =  4.5270000862445199635215E-5;
    const double Q0logl =  6.0118660497603843919306E1;
    const double Q1logl =  2.1642788614495947685003E2;
    const double Q2logl =  3.0909872225312059774938E2;
    const double Q3logl =  2.2176239823732856465394E2;
    const double Q4logl =  8.3047565967967209469434E1;
    const double Q5logl =  1.5062909083469192043167E1;

    // Taylor coefficients for exp function, 1/n!
    const double p2  = 1./2.;
    const double p3  = 1./6.;
    const double p4  = 1./24.;
    const double p5  = 1./120.;
    const double p6  = 1./720.;
    const double p7  = 1./5040.;
    const double p8  = 1./40320.;
    const double p9  = 1./362880.;
    const double p10 = 1./3628800.;
    const double p11 = 1./39916800.;
    const double p12 = 1./479001600.;
    const double p13 = 1./6227020800.;

    // data vectors
    VTYPE x, x1, x2;
    VTYPE px, qx, ef, yr, v, z, z1;
    VTYPE lg, lg1, lg2;
    VTYPE lgerr, x2err;
    VTYPE e1, e2, e3, ee;
    // integer vectors
    ITYPE ei, ej, yodd;
    // boolean vectors
    BTYPE blend, xzero, xnegative;
    BTYPE overflow, underflow, xfinite, yfinite, efinite;

    // remove sign
    x1 = abs(x0);

    // Separate mantissa from exponent
    // This gives the mantissa * 0.5
    x  = fraction_2(x1);

    // reduce range of x = +/- sqrt(2)/2
    blend = x > VM_SQRT2*0.5;
    x  = if_add(!blend, x, x);                   // conditional add

    // Pade approximation
    // Higher precision than in log function. Still higher precision wanted
    x -= 1.0;
    x2 = x*x;
    px = polynomial_6  (x, P0logl, P1logl, P2logl, P3logl, P4logl, P5logl, P6logl);
    px *= x * x2;
    qx = polynomial_6n (x, Q0logl, Q1logl, Q2logl, Q3logl, Q4logl, Q5logl);
    lg1 = px / qx;

    // extract exponent
    ef = exponent_f(x1);
    ef = if_add(blend, ef, 1.);                  // conditional add

    // multiply exponent by y
    // nearest integer e1 goes into exponent of result, remainder yr is added to log
    e1 = round(ef * y);
    yr = mul_sub_x(ef, y, e1);                   // calculate remainder yr. precision very important here

    // add initial terms to Pade expansion
    lg = nmul_add(0.5, x2, x) + lg1;             // lg = (x - 0.5 * x2) + lg1;
    // calculate rounding errors in lg
    // rounding error in multiplication 0.5*x*x
    x2err = mul_sub_x(0.5*x, x, 0.5*x2);
    // rounding error in additions and subtractions
    lgerr = mul_add(0.5, x2, lg - x) - lg1;      // lgerr = ((lg - x) + 0.5 * x2) - lg1;

    // extract something for the exponent
    e2 = round(lg * y * VM_LOG2E);
    // subtract this from lg, with extra precision
    v = mul_sub_x(lg, y, e2 * ln2d_hi);
    v = nmul_add(e2, ln2d_lo, v);                // v -= e2 * ln2d_lo;

    // add remainder from ef * y
    v = mul_add(yr, VM_LN2, v);                  // v += yr * VM_LN2;

    // correct for previous rounding errors
    v = nmul_add(lgerr + x2err, y, v);           // v -= (lgerr + x2err) * y;

    // exp function

    // extract something for the exponent if possible
    x = v;
    e3 = round(x*log2e);
    // high precision multiplication not needed here because abs(e3) <= 1
    x = nmul_add(e3, VM_LN2, x);                 // x -= e3 * VM_LN2;

    z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13);
    z = z + 1.0;

    // contributions to exponent
    ee = e1 + e2 + e3;
    ei = round_to_int64_limited(ee);
    // biased exponent of result:
    ej = ei + (ITYPE(reinterpret_i(z)) >> 52);
    // check exponent for overflow and underflow
    overflow  = BTYPE(ej >= 0x07FF) | (ee >  3000.);
    underflow = BTYPE(ej <= 0x0000) | (ee < -3000.);

    // add exponent by integer addition
    z = reinterpret_d(ITYPE(reinterpret_i(z)) + (ei << 52));

    // check for special cases
    xfinite   = is_finite(x0);
    yfinite   = is_finite(y);
    efinite   = is_finite(ee);
    xzero     = is_zero_or_subnormal(x0);
    xnegative = x0  < 0.;

    // check for overflow and underflow
    if (horizontal_or(overflow | underflow)) {
        // handle errors
        z = select(underflow, VTYPE(0.), z);
        z = select(overflow, infinite_vec<VTYPE>(), z);
    }

    // check for x == 0
    z = select(xzero, select(y < 0., infinite_vec<VTYPE>(), select(y == 0., VTYPE(1.), VTYPE(0.))), z);

    // check for x < 0. y must be integer
    if (horizontal_or(xnegative)) {
        // test if y odd
        yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63;     // convert y to integer and shift bit 0 to position of sign bit
        z1 = z | (x0 & VTYPE(reinterpret_d(yodd)));              // apply sign if y odd
        z1 = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW)); // NAN if y not integer
        z = select(xnegative, z1, z);
    }

    // check for range errors
    if (horizontal_and(xfinite & yfinite & efinite)) {
        // fast return if no special cases
        return z;
    }
    // handle special error cases
    z = select(yfinite & efinite, z, select(x1 == 1., VTYPE(1.), select((x1 > 1.) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.)));
    yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63; // same as above
    z = select(xfinite, z, select(y == 0., VTYPE(1.), select(y < 0., VTYPE(0.), infinite_vec<VTYPE>() | ( VTYPE(reinterpret_d(yodd)) & x0))));
    z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
    return z;
};


//This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
//template <typename TT> static Vec2d pow(Vec2d const & a, TT n);

// instantiations of pow_template_d:
template <>
inline Vec2d pow<Vec2d const &>(Vec2d const & x, Vec2d const & y) {
    return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);
}

template <>
inline Vec2d pow<double>(Vec2d const & x, double y) {
    return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);
}
template <>
inline Vec2d pow<float>(Vec2d const & x, float y) {
    return pow_template_d<Vec2d, Vec2q, Vec2db>(x, (double)y);
}

#if MAX_VECTOR_SIZE >= 256

template <>
inline Vec4d pow<Vec4d const &>(Vec4d const & x, Vec4d const & y) {
    return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);
}

template <>
inline Vec4d pow<double>(Vec4d const & x, double y) {
    return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);
}

template <>
inline Vec4d pow<float>(Vec4d const & x, float y) {
    return pow_template_d<Vec4d, Vec4q, Vec4db>(x, (double)y);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

template <>
inline Vec8d pow<Vec8d const &>(Vec8d const & x, Vec8d const & y) {
    return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);
}

template <>
inline Vec8d pow<double>(Vec8d const & x, double y) {
    return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);
}

template <>
inline Vec8d pow<float>(Vec8d const & x, float y) {
    return pow_template_d<Vec8d, Vec8q, Vec8db>(x, (double)y);
}

#endif // MAX_VECTOR_SIZE >= 512


// ****************************************************************************
//                pow template, single precision
// ****************************************************************************

// Template parameters:
// VTYPE:  data vector type
// ITYPE:  signed integer vector type
// BTYPE:  boolean vector type
// Calculate x to the power of y
template <class VTYPE, class ITYPE, class BTYPE>
static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) {

    // define constants
    const float ln2f_hi  =  0.693359375f;
    const float ln2f_lo  = -2.12194440e-4f;
    //const float max_expf =  87.3f;
    const float log2e    =  float(VM_LOG2E);     // 1/log(2)
    const float pow2_23  =  8388608.0f;          // 2^23

    const float P0logf  =  3.3333331174E-1f;
    const float P1logf  = -2.4999993993E-1f;
    const float P2logf  =  2.0000714765E-1f;
    const float P3logf  = -1.6668057665E-1f;
    const float P4logf  =  1.4249322787E-1f;
    const float P5logf  = -1.2420140846E-1f;
    const float P6logf  =  1.1676998740E-1f;
    const float P7logf  = -1.1514610310E-1f;
    const float P8logf  =  7.0376836292E-2f;

    // Taylor coefficients for exp function, 1/n!
    const float p2expf   =  1.f/2.f;
    const float p3expf   =  1.f/6.f;
    const float p4expf   =  1.f/24.f;
    const float p5expf   =  1.f/120.f;
    const float p6expf   =  1.f/720.f;
    const float p7expf   =  1.f/5040.f;

    // data vectors
    VTYPE x, x1, x2;
    VTYPE ef, yr, v, z, z1;
    VTYPE lg, lg1;
    VTYPE lgerr, x2err;
    VTYPE e1, e2, e3, ee;
    // integer vectors
    ITYPE ei, ej, yodd;
    // boolean vectors
    BTYPE blend, xzero, xnegative;
    BTYPE overflow, underflow, xfinite, yfinite, efinite;

    // remove sign
    x1 = abs(x0);

    // Separate mantissa from exponent
    // This gives the mantissa * 0.5
    x  = fraction_2(x1);

    // reduce range of x = +/- sqrt(2)/2
    blend = x > float(VM_SQRT2 * 0.5);
    x  = if_add(!blend, x, x);                   // conditional add

    // Taylor expansion, high precision
    x   -= 1.0f;
    x2   = x * x;
    lg1  = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
    lg1 *= x2 * x;

    // extract exponent
    ef = exponent_f(x1);
    ef = if_add(blend, ef, 1.0f);                // conditional add

    // multiply exponent by y
    // nearest integer e1 goes into exponent of result, remainder yr is added to log
    e1 = round(ef * y);
    yr = mul_sub_x(ef, y, e1);                   // calculate remainder yr. precision very important here

    // add initial terms to expansion
    lg = nmul_add(0.5f, x2, x) + lg1;            // lg = (x - 0.5f * x2) + lg1;

    // calculate rounding errors in lg
    // rounding error in multiplication 0.5*x*x
    x2err = mul_sub_x(0.5f*x, x, 0.5f * x2);
    // rounding error in additions and subtractions
    lgerr = mul_add(0.5f, x2, lg - x) - lg1;     // lgerr = ((lg - x) + 0.5f * x2) - lg1;

    // extract something for the exponent
    e2 = round(lg * y * float(VM_LOG2E));
    // subtract this from lg, with extra precision
    v = mul_sub_x(lg, y, e2 * ln2f_hi);
    v = nmul_add(e2, ln2f_lo, v);                // v -= e2 * ln2f_lo;

    // correct for previous rounding errors
    v -= mul_sub(lgerr + x2err, y, yr * float(VM_LN2)); // v -= (lgerr + x2err) * y - yr * float(VM_LN2) ;

    // exp function

    // extract something for the exponent if possible
    x = v;
    e3 = round(x*log2e);
    // high precision multiplication not needed here because abs(e3) <= 1
    x = nmul_add(e3, float(VM_LN2), x);          // x -= e3 * float(VM_LN2);

    // Taylor polynomial
    x2  = x  * x;
    z = polynomial_5(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf)*x2 + x + 1.0f;

    // contributions to exponent
    ee = e1 + e2 + e3;
    ei = round_to_int(ee);
    // biased exponent of result:
    ej = ei + (ITYPE(reinterpret_i(z)) >> 23);
    // check exponent for overflow and underflow
    overflow  = BTYPE(ej >= 0x0FF) | (ee >  300.f);
    underflow = BTYPE(ej <= 0x000) | (ee < -300.f);

    // add exponent by integer addition
    z = reinterpret_f(ITYPE(reinterpret_i(z)) + (ei << 23)); // the extra 0x10000 is shifted out here

    // check for special cases
    xfinite   = is_finite(x0);
    yfinite   = is_finite(y);
    efinite   = is_finite(ee);
    xzero     = is_zero_or_subnormal(x0);
    xnegative = x0  < 0.f;

    // check for overflow and underflow
    if (horizontal_or(overflow | underflow)) {
        // handle errors
        z = select(underflow, VTYPE(0.f), z);
        z = select(overflow, infinite_vec<VTYPE>(), z);
    }

    // check for x == 0
    z = select(xzero, select(y < 0.f, infinite_vec<VTYPE>(), select(y == 0.f, VTYPE(1.), VTYPE(0.f))), z);

    // check for x < 0. y must be integer
    if (horizontal_or(xnegative)) {
        // test if y odd
        yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31;     // convert y to integer and shift bit 0 to position of sign bit
        z1 = z | (x0 & VTYPE(reinterpret_f(yodd)));              // apply sign if y odd
        z1 = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW)); // NAN if y not integer
        z = select(xnegative, z1, z);
    }

    // check for range errors
    if (horizontal_and(xfinite & yfinite & efinite)) {
        // fast return if no special cases
        return z;
    }
    // handle special error cases
    z = select(yfinite & efinite, z, select(x1 == 1.f, VTYPE(1.f), select((x1 > 1.f) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.f)));
    yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31; // same as above
    z = select(xfinite, z, select(y == 0.f, VTYPE(1.f), select(y < 0.f, VTYPE(0.f), infinite_vec<VTYPE>() | (VTYPE(reinterpret_f(yodd)) & x0))));
    z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
    return z;
}

//This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
//template <typename TT> static Vec4f pow(Vec4f const & a, TT n);

template <>
inline Vec4f pow<Vec4f const &>(Vec4f const & x, Vec4f const & y) {
    return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
}

template <>
inline Vec4f pow<float>(Vec4f const & x, float y) {
    return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
}

template <>
inline Vec4f pow<double>(Vec4f const & x, double y) {
    return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, (float)y);
}

#if MAX_VECTOR_SIZE >= 256

template <>
inline Vec8f pow<Vec8f const &>(Vec8f const & x, Vec8f const & y) {
    return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, y);
}

template <>
inline Vec8f pow<float>(Vec8f const & x, float y) {
    return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, y);
}
template <>
inline Vec8f pow<double>(Vec8f const & x, double y) {
    return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, (float)y);
}

#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512

inline Vec16f pow(Vec16f const & x, Vec16f const & y) {
    return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, y);
}

inline Vec16f pow(Vec16f const & x, float y) {
    return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, y);
}

inline Vec16f pow(Vec16f const & x, double y) {
    return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, (float)y);
}

#endif // MAX_VECTOR_SIZE >= 512


// *************************************************************
//             power function with rational exponent
// *************************************************************
// Power function with rational exponent: x^(a/b)
// Template must be defined as class to allow partial template specialization
template <int a, int b>
class Power_rational {
public:
    // overloaded member function for each vector type
    static Vec4f pow(Vec4f const & x) {
        Vec4f y = x;
        // negative x allowed when b odd or a even
        // (if a is even then either b is odd or a/b can be reduced,
        // but we can check a even anyway at no cost to be sure)
        if (a == 0) return 1.f;
        if ((b | ~a) & 1) y = abs(y);
        y = ::pow(y, float(double(a)/double(b)));
        if (a & b & 1) y = sign_combine(y, x);          // apply sign if a and b both odd
        if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y); // zero allowed for positive a and b
        return y;
    }
    static Vec2d pow(Vec2d const & x) {
        Vec2d y = x;
        if (a == 0) return 1.;
        if ((b | ~a) & 1) y = abs(y);
        y = ::pow(y, double((long double)a/(long double)b));
        if (a & b & 1) y = sign_combine(y, x);
        if ((a ^ b) >= 0) y = select(x == 0., 0., y);
        return y;
    }
#if MAX_VECTOR_SIZE >= 256
    static Vec8f pow(Vec8f const & x) {
        Vec8f y = x;
        if (a == 0) return 1.f;
        if ((b | ~a) & 1) y = abs(y);
        y = ::pow(y, float(double(a)/double(b)));
        if (a & b & 1) y = sign_combine(y, x);
        if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
        return y;
    }
    static Vec4d pow(Vec4d const & x) {
        Vec4d y = x;
        if (a == 0) return 1.;
        if ((b | ~a) & 1) y = abs(y);
        y = ::pow(y, double((long double)a/(long double)b));
        if (a & b & 1) y = sign_combine(y, x);
        if ((a ^ b) >= 0) y = select(x == 0., 0., y);
        return y;
    }
#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512
    static Vec16f pow(Vec16f const & x) {
        Vec16f y = x;
        if (a == 0) return 1.f;
        if ((b | ~a) & 1) y = abs(y);
        y = ::pow(y, float(double(a)/double(b)));
        if (a & b & 1) y = sign_combine(y, x);
        if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
        return y;
    }
    static Vec8d pow(Vec8d const & x) {
        Vec8d y = x;
        if (a == 0) return 1.;
        if ((b | ~a) & 1) y = abs(y);
        y = ::pow(y, double((long double)a/(long double)b));
        if (a & b & 1) y = sign_combine(y, x);
        if ((a ^ b) >= 0) y = select(x == 0., 0., y);
        return y;
    }
#endif // MAX_VECTOR_SIZE >= 512
};

// partial specialization for b = 0
template<int a>
class Power_rational<a,0> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {return nan_vec<VTYPE>(NAN_LOG);}
};

// partial specialization for b = 1
template<int a>
class Power_rational<a,1> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {return pow_n<a>(x);}
};

// partial specialization for b = 2
template<int a>
class Power_rational<a,2> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {
        VTYPE y = pow_n<(a > 0 ? a/2 : (a-1)/2)>(x);
        if (a & 1) y *= sqrt(x);
        return y;
    }
};

// full specialization for a = 1, b = 2
template<>
class Power_rational<1,2> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {
        return sqrt(x);
    }
};

// full specialization for a = -1, b = 2
template<>
class Power_rational<-1,2> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {
        // (this is faster than iteration method on modern CPUs)
        return VTYPE(1.f) / sqrt(x);
    }
};

// partial specialization for b = 3
template<int a>
class Power_rational<a,3> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {
        VTYPE t;
        switch (a % 3) {
        case -2:
            t = reciprocal_cbrt(x);
            t *= t;
            if (a == -2) return t;
            t = t / pow_n<(-a-2)/3>(x);
            break;
        case -1:
            t = reciprocal_cbrt(x);
            if (a == -1) return t;
            t = t / pow_n<(-a-1)/3>(x);
            break;
        case  0:
            t = pow_n<a/3>(x);
            break;
        case  1:
            t = cbrt(x);
            if (a == 1) return t;
            t = t * pow_n<a/3>(x);
            break;
        case  2:
            t = square_cbrt(x);
            if (a == 2) return t;
            t = t * pow_n<a/3>(x);
            break;
        }
        return t;
    }
};

// partial specialization for b = 4
template<int a>
class Power_rational<a,4> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {
        VTYPE t, s1, s2;
        s1 = sqrt(x);
        if (a & 1) s2 = sqrt(s1);
        switch (a % 4) {
        case -3:
            t = s2 / pow_n<1+(-a)/4>(x);
            break;
        case -2:
            t = s1 / pow_n<1+(-a)/4>(x);
            break;
        case -1:
            if (a != -1) s2 *= pow_n<(-a)/4>(x);
            t = VTYPE(1.f) / s2;
            break;
        case  0: default:
            t = pow_n<a/4>(x);
            break;
        case  1:
            t = s2;
            if (a != 1) t *= pow_n<a/4>(x);
            break;
        case  2:
            t = s1;
            if (a != 2) t *= pow_n<a/4>(x);
            break;
        case  3:
            t = s1 * s2;
            if (a != 3) t *= pow_n<a/4>(x);
            break;
        }
        return t;
    }
};

// partial specialization for b = 6
template<int a>
class Power_rational<a,6> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {
        VTYPE t, s1, s2, s3;
        switch (a % 6) {
        case -5:
            t = reciprocal_cbrt(x);
            t = t * t * sqrt(t);
            if (a != -5) t /= pow_n<(-a)/6>(x);
            break;
        case -4:
            t = reciprocal_cbrt(x);
            t *= t;
            if (a != -4) t /= pow_n<(-a)/6>(x);
            break;
        case -3:
            t = pow_n<a/6>(x);
            t /= sqrt(x);
            break;
        case -2:
            t = reciprocal_cbrt(x);
            if (a != -2) t /= pow_n<(-a)/6>(x);
            break;
        case -1:
            t = sqrt(reciprocal_cbrt(x));
            if (a != -1) t /= pow_n<(-a)/6>(x);
            break;
        case  0: default:
            t = pow_n<a/6>(x);
            break;
        case  1:
            t = sqrt(cbrt(x));
            if (a != 1) t *= pow_n<a/6>(x);
            break;
        case  2:
            t = cbrt(x);
            if (a != 2) t *= pow_n<a/6>(x);
            break;
        case  3:
            t = sqrt(x);
            if (a != 3) t *= pow_n<a/6>(x);
            break;
        case  4:
            t = square_cbrt(x);
            if (a != 4) t *= pow_n<a/6>(x);
            break;
        case  5:
            t = cbrt(x);
            t = t * t * sqrt(t);
            if (a != 5) t *= pow_n<a/6>(x);
            break;
        }
        return t;
    }
};

// partial specialization for b = 8
template<int a>
class Power_rational<a,8> {
public:
    template<class VTYPE>
    static VTYPE pow(VTYPE const & x) {
        VTYPE t, s1, s2, s3;
        s1 = sqrt(x);               // x^(1/2)
        if (a & 3) s2 = sqrt(s1);   // x^(1/4)
        if (a & 1) s3 = sqrt(s2);   // x^(1/8)
        switch (a % 8) {
        case -7:
            t = s3 / pow_n<1+(-a)/8>(x);
            break;
        case -6:
            t = s2 / pow_n<1+(-a)/8>(x);
            break;
        case -5:
            t = s3 * (s2 / pow_n<1+(-a)/8>(x));
            break;
        case -4:
            t = s1 / pow_n<1+(-a)/8>(x);
            break;
        case -3:
            t = s3 * (s1 / pow_n<1+(-a)/8>(x));
            break;
        case -2:
            if (a != -2) s2 *= pow_n<(-a)/8>(x);
            t = VTYPE(1.f) / s2;
            break;
        case -1:
            if (a != -1) s3 *= pow_n<(-a)/8>(x);
            t = VTYPE(1.f) / s3;
            break;
        case  0: default:
            t = pow_n<a/8>(x);
            break;
        case  1:
            t = s3;
            if (a != 1) t *= pow_n<a/8>(x);
            break;
        case  2:
            t = s2;
            if (a != 2) t *= pow_n<a/8>(x);
            break;
        case  3:
            t = s2 * s3;
            if (a != 3) t *= pow_n<a/8>(x);
            break;
        case  4:
            t = s1;
            if (a != 4) t *= pow_n<a/8>(x);
            break;
        case  5:
            t = s1 * s3;
            if (a != 5) t *= pow_n<a/8>(x);
            break;
        case  6:
            t = s1 * s2;
            if (a != 6) t *= pow_n<a/8>(x);
            break;
        case  7:
            t = s2 * s3;
            if (a != 7) s1 *= pow_n<a/8>(x);
            t *= s1;
            break;

        }
        return t;
    }
};

// macro to call template class member function pow
#define pow_ratio(x, a, b) (Power_rational<(b)<0 ? -(a):(a), (b)<0 ? -(b):(b)> ().pow(x))


/******************************************************************************
*                 Detect NAN codes
*
* These functions return the code hidden in a NAN. The sign bit is ignored
******************************************************************************/

Vec4i nan_code(Vec4f const & x) {
    Vec4i  a = reinterpret_i(x);
    Vec4ib b = (a & 0x7F800000) == 0x7F800000;   // check if NAN/INF
    return a & 0x007FFFFF & Vec4i(b);            // isolate NAN code bits
}

// This function returns the code hidden in a NAN. The sign bit is ignored
Vec2q nan_code(Vec2d const & x) {
    Vec2q  a = reinterpret_i(x);
    Vec2q const m = 0x7FF0000000000000;
    Vec2q const n = 0x000FFFFFFFFFFFFF;
    Vec2qb b = (a & m) == m;                     // check if NAN/INF
    return a & n & Vec2q(b);                     // isolate NAN code bits
}

#if MAX_VECTOR_SIZE >= 256

// This function returns the code hidden in a NAN. The sign bit is ignored
Vec8i nan_code(Vec8f const & x) {
    Vec8i  a = reinterpret_i(x);
    Vec8ib b = (a & 0x7F800000) == 0x7F800000;   // check if NAN/INF
    return a & 0x007FFFFF & Vec8i(b);            // isolate NAN code bits
}

// This function returns the code hidden in a NAN. The sign bit is ignored
Vec4q nan_code(Vec4d const & x) {
    Vec4q  a = reinterpret_i(x);
    Vec4q const m = 0x7FF0000000000000;
    Vec4q const n = 0x000FFFFFFFFFFFFF;
    Vec4qb b = (a & m) == m;                     // check if NAN/INF
    return a & n & Vec4q(b);                     // isolate NAN code bits
}

#endif // MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 512

// This function returns the code hidden in a NAN. The sign bit is ignored
Vec16i nan_code(Vec16f const & x) {
    Vec16i  a = Vec16i(reinterpret_i(x));
    Vec16ib b = (a & 0x7F800000) == 0x7F800000;  // check if NAN/INF
    return a & 0x007FFFFF & Vec16i(b);           // isolate NAN code bits
}

// This function returns the code hidden in a NAN. The sign bit is ignored
Vec8q nan_code(Vec8d const & x) {
    Vec8q  a = Vec8q(reinterpret_i(x));
    Vec8q const m = 0x7FF0000000000000;
    Vec8q const n = 0x000FFFFFFFFFFFFF;
    Vec8qb b = (a & m) == m;                     // check if NAN/INF
    return a & n & Vec8q(b);                     // isolate NAN code bits
}

#endif // MAX_VECTOR_SIZE >= 512

#endif  // VECTORMATH_EXP_H