ClickHouse/src/Common/bitonicSortKernels.cl

1251 lines
46 KiB
Common Lisp
Raw Normal View History

2020-05-15 00:01:14 +00:00
// Copyright (c) 2009-2011 Intel Corporation
// All rights reserved.
//
// WARRANTY DISCLAIMER
//
// THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
// MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of the Materials, and requests that all
// problem reports or change requests be submitted to it directly
static const char * bitonic_sort_kernels = R"(
uchar4 makeMask_uchar(uchar4 srcLeft, uchar4 srcRight, uint4 indexLeft, uint4 indexRight)
{
uchar4 mask = convert_uchar4(srcLeft < srcRight);
if (srcLeft.x == srcRight.x)
mask.x = (indexLeft.x < indexRight.x) ? UCHAR_MAX : 0;
if (srcLeft.y == srcRight.y)
mask.y = (indexLeft.y < indexRight.y) ? UCHAR_MAX : 0;
if (srcLeft.z == srcRight.z)
mask.z = (indexLeft.z < indexRight.z) ? UCHAR_MAX : 0;
if (srcLeft.w == srcRight.w)
mask.w = (indexLeft.w < indexRight.w) ? UCHAR_MAX : 0;
return mask;
}
char4 makeMask_char(char4 srcLeft, char4 srcRight, uint4 indexLeft, uint4 indexRight)
{
char4 mask = srcLeft < srcRight;
if (srcLeft.x == srcRight.x)
mask.x = -(indexLeft.x < indexRight.x);
if (srcLeft.y == srcRight.y)
mask.y = -(indexLeft.y < indexRight.y);
if (srcLeft.z == srcRight.z)
mask.z = -(indexLeft.z < indexRight.z);
if (srcLeft.w == srcRight.w)
mask.w = -(indexLeft.w < indexRight.w);
return mask;
}
ushort4 makeMask_ushort(ushort4 srcLeft, ushort4 srcRight, uint4 indexLeft, uint4 indexRight)
{
ushort4 mask = convert_ushort4(srcLeft < srcRight);
if (srcLeft.x == srcRight.x)
mask.x = (indexLeft.x < indexRight.x) ? USHRT_MAX : 0;
if (srcLeft.y == srcRight.y)
mask.y = (indexLeft.y < indexRight.y) ? USHRT_MAX : 0;
if (srcLeft.z == srcRight.z)
mask.z = (indexLeft.z < indexRight.z) ? USHRT_MAX : 0;
if (srcLeft.w == srcRight.w)
mask.w = (indexLeft.w < indexRight.w) ? USHRT_MAX : 0;
return mask;
}
short4 makeMask_short(short4 srcLeft, short4 srcRight, uint4 indexLeft, uint4 indexRight)
{
short4 mask = srcLeft < srcRight;
if (srcLeft.x == srcRight.x)
mask.x = -(indexLeft.x < indexRight.x);
if (srcLeft.y == srcRight.y)
mask.y = -(indexLeft.y < indexRight.y);
if (srcLeft.z == srcRight.z)
mask.z = -(indexLeft.z < indexRight.z);
if (srcLeft.w == srcRight.w)
mask.w = -(indexLeft.w < indexRight.w);
return mask;
}
uint4 makeMask_uint(uint4 srcLeft, uint4 srcRight, uint4 indexLeft, uint4 indexRight)
{
uint4 mask = convert_uint4(srcLeft < srcRight);
if (srcLeft.x == srcRight.x)
mask.x = (indexLeft.x < indexRight.x) ? UINT_MAX : 0;
if (srcLeft.y == srcRight.y)
mask.y = (indexLeft.y < indexRight.y) ? UINT_MAX : 0;
if (srcLeft.z == srcRight.z)
mask.z = (indexLeft.z < indexRight.z) ? UINT_MAX : 0;
if (srcLeft.w == srcRight.w)
mask.w = (indexLeft.w < indexRight.w) ? UINT_MAX : 0;
return mask;
}
int4 makeMask_int(int4 srcLeft, int4 srcRight, uint4 indexLeft, uint4 indexRight)
{
int4 mask = srcLeft < srcRight;
if (srcLeft.x == srcRight.x)
mask.x = -(indexLeft.x < indexRight.x);
if (srcLeft.y == srcRight.y)
mask.y = -(indexLeft.y < indexRight.y);
if (srcLeft.z == srcRight.z)
mask.z = -(indexLeft.z < indexRight.z);
if (srcLeft.w == srcRight.w)
mask.w = -(indexLeft.w < indexRight.w);
return mask;
}
ulong4 makeMask_ulong(ulong4 srcLeft, ulong4 srcRight, uint4 indexLeft, uint4 indexRight)
{
ulong4 mask = convert_ulong4(srcLeft < srcRight);
if (srcLeft.x == srcRight.x)
mask.x = (indexLeft.x < indexRight.x) ? ULONG_MAX : 0;
if (srcLeft.y == srcRight.y)
mask.y = (indexLeft.y < indexRight.y) ? ULONG_MAX : 0;
if (srcLeft.z == srcRight.z)
mask.z = (indexLeft.z < indexRight.z) ? ULONG_MAX : 0;
if (srcLeft.w == srcRight.w)
mask.w = (indexLeft.w < indexRight.w) ? ULONG_MAX : 0;
return mask;
}
long4 makeMask_long(long4 srcLeft, long4 srcRight, uint4 indexLeft, uint4 indexRight)
{
long4 mask = srcLeft < srcRight;
if (srcLeft.x == srcRight.x)
mask.x = -(indexLeft.x < indexRight.x);
if (srcLeft.y == srcRight.y)
mask.y = -(indexLeft.y < indexRight.y);
if (srcLeft.z == srcRight.z)
mask.z = -(indexLeft.z < indexRight.z);
if (srcLeft.w == srcRight.w)
mask.w = -(indexLeft.w < indexRight.w);
return mask;
}
__kernel void __attribute__((vec_type_hint(char4))) __attribute__((vec_type_hint(uint4))) bitonicSort_char(__global char4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
char4 srcLeft, srcRight, mask;
char4 imask10 = (char4)(0, 0, -1, -1);
char4 imask11 = (char4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight);
char4 imin = (srcLeft & mask) | (srcRight & ~mask);
char4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
char4 imask0 = (char4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
} else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
__kernel void /*__attribute__((vec_type_hint(uchar4)))*/ bitonicSort_uchar(__global uchar4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
uchar4 srcLeft, srcRight, mask;
uchar4 imask10 = (uchar4)(0, 0, -1, -1);
uchar4 imask11 = (uchar4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight);
uchar4 imin = (srcLeft & mask) | (srcRight & ~mask);
uchar4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
uchar4 imask0 = (uchar4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir) {
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
} else {
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
__kernel void /*__attribute__((vec_type_hint(short4)))*/ bitonicSort_short(__global short4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
short4 srcLeft, srcRight, mask;
short4 imask10 = (short4)(0, 0, -1, -1);
short4 imask11 = (short4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight);
short4 imin = (srcLeft & mask) | (srcRight & ~mask);
short4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
short4 imask0 = (short4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
__kernel void /*__attribute__((vec_type_hint(ushort4)))*/ bitonicSort_ushort(__global ushort4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
ushort4 srcLeft, srcRight, mask;
ushort4 imask10 = (ushort4)(0, 0, -1, -1);
ushort4 imask11 = (ushort4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight);
ushort4 imin = (srcLeft & mask) | (srcRight & ~mask);
ushort4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
ushort4 imask0 = (ushort4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir) {
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
} else {
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
__kernel void /*__attribute__((vec_type_hint(int4)))*/ bitonicSort_int(__global int4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
int4 srcLeft, srcRight, mask;
int4 imask10 = (int4)(0, 0, -1, -1);
int4 imask11 = (int4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
int random_id = (i * 5 + stage * 42 + passOfStage * 47 + dir * 88) % 100;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight);
int4 imin = (srcLeft & mask) | (srcRight & ~mask);
int4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
int4 imask0 = (int4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
__kernel void /*__attribute__((vec_type_hint(uint4)))*/ bitonicSort_uint(__global uint4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
uint4 srcLeft, srcRight, mask;
uint4 imask10 = (uint4)(0, 0, -1, -1);
uint4 imask11 = (uint4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight);
uint4 imin = (srcLeft & mask) | (srcRight & ~mask);
uint4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
uint4 imask0 = (uint4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
__kernel void /*__attribute__((vec_type_hint(long4)))*/ bitonicSort_long(__global long4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
long4 srcLeft, srcRight, mask;
long4 imask10 = (long4)(0, 0, -1, -1);
long4 imask11 = (long4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight);
long4 imin = (srcLeft & mask) | (srcRight & ~mask);
long4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
long4 imask0 = (long4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
__kernel void /*__attribute__((vec_type_hint(ulong4)))*/ bitonicSort_ulong(__global ulong4 * theArray,
__global uint4 * indices,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
ulong4 srcLeft, srcRight, mask;
ulong4 imask10 = (ulong4)(0, 0, -1, -1);
ulong4 imask11 = (ulong4)(0, -1, 0, -1);
uint4 indexLeft, indexRight;
if (stage > 0)
{
if (passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1); // length of arrow
size_t lmask = r - 1;
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
indexLeft = indices[left];
indexRight = indices[right];
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight);
ulong4 imin = (srcLeft & mask) | (srcRight & ~mask);
ulong4 imax = (srcLeft & ~mask) | (srcRight & mask);
if (((i >> (stage - 1)) & 1) ^ dir)
{
theArray[left] = imin;
theArray[right] = imax;
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
else
{
theArray[right] = imin;
theArray[left] = imax;
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.zwxy;
indexLeft = indices[i];
indexRight = indexLeft.zwxy;
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if (((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
else // first stage, sort inside one four
{
ulong4 imask0 = (ulong4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.yxwz;
indexLeft = indices[i];
indexRight = indexLeft.yxwz;
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
if (dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
srcRight = srcLeft.zwxy;
indexRight = indexLeft.zwxy;
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
if ((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.yxwz;
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
indexRight = indexLeft.yxwz;
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
}
}
}
)";