mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 10:52:30 +00:00
1251 lines
46 KiB
Common Lisp
1251 lines
46 KiB
Common Lisp
// Copyright (c) 2009-2011 Intel Corporation
|
|
// All rights reserved.
|
|
//
|
|
// WARRANTY DISCLAIMER
|
|
//
|
|
// THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
|
|
// MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Intel Corporation is the author of the Materials, and requests that all
|
|
// problem reports or change requests be submitted to it directly
|
|
|
|
static const char * bitonic_sort_kernels = R"(
|
|
|
|
uchar4 makeMask_uchar(uchar4 srcLeft, uchar4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
uchar4 mask = convert_uchar4(srcLeft < srcRight);
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = (indexLeft.x < indexRight.x) ? UCHAR_MAX : 0;
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = (indexLeft.y < indexRight.y) ? UCHAR_MAX : 0;
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = (indexLeft.z < indexRight.z) ? UCHAR_MAX : 0;
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = (indexLeft.w < indexRight.w) ? UCHAR_MAX : 0;
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
char4 makeMask_char(char4 srcLeft, char4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
char4 mask = srcLeft < srcRight;
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = -(indexLeft.x < indexRight.x);
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = -(indexLeft.y < indexRight.y);
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = -(indexLeft.z < indexRight.z);
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = -(indexLeft.w < indexRight.w);
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
ushort4 makeMask_ushort(ushort4 srcLeft, ushort4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
ushort4 mask = convert_ushort4(srcLeft < srcRight);
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = (indexLeft.x < indexRight.x) ? USHRT_MAX : 0;
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = (indexLeft.y < indexRight.y) ? USHRT_MAX : 0;
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = (indexLeft.z < indexRight.z) ? USHRT_MAX : 0;
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = (indexLeft.w < indexRight.w) ? USHRT_MAX : 0;
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
short4 makeMask_short(short4 srcLeft, short4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
short4 mask = srcLeft < srcRight;
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = -(indexLeft.x < indexRight.x);
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = -(indexLeft.y < indexRight.y);
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = -(indexLeft.z < indexRight.z);
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = -(indexLeft.w < indexRight.w);
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
uint4 makeMask_uint(uint4 srcLeft, uint4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
uint4 mask = convert_uint4(srcLeft < srcRight);
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = (indexLeft.x < indexRight.x) ? UINT_MAX : 0;
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = (indexLeft.y < indexRight.y) ? UINT_MAX : 0;
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = (indexLeft.z < indexRight.z) ? UINT_MAX : 0;
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = (indexLeft.w < indexRight.w) ? UINT_MAX : 0;
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
int4 makeMask_int(int4 srcLeft, int4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
int4 mask = srcLeft < srcRight;
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = -(indexLeft.x < indexRight.x);
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = -(indexLeft.y < indexRight.y);
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = -(indexLeft.z < indexRight.z);
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = -(indexLeft.w < indexRight.w);
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
ulong4 makeMask_ulong(ulong4 srcLeft, ulong4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
ulong4 mask = convert_ulong4(srcLeft < srcRight);
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = (indexLeft.x < indexRight.x) ? ULONG_MAX : 0;
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = (indexLeft.y < indexRight.y) ? ULONG_MAX : 0;
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = (indexLeft.z < indexRight.z) ? ULONG_MAX : 0;
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = (indexLeft.w < indexRight.w) ? ULONG_MAX : 0;
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
long4 makeMask_long(long4 srcLeft, long4 srcRight, uint4 indexLeft, uint4 indexRight)
|
|
{
|
|
long4 mask = srcLeft < srcRight;
|
|
|
|
if (srcLeft.x == srcRight.x)
|
|
mask.x = -(indexLeft.x < indexRight.x);
|
|
if (srcLeft.y == srcRight.y)
|
|
mask.y = -(indexLeft.y < indexRight.y);
|
|
if (srcLeft.z == srcRight.z)
|
|
mask.z = -(indexLeft.z < indexRight.z);
|
|
if (srcLeft.w == srcRight.w)
|
|
mask.w = -(indexLeft.w < indexRight.w);
|
|
|
|
return mask;
|
|
}
|
|
|
|
|
|
__kernel void __attribute__((vec_type_hint(char4))) __attribute__((vec_type_hint(uint4))) bitonicSort_char(__global char4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
size_t i = get_global_id(0);
|
|
char4 srcLeft, srcRight, mask;
|
|
char4 imask10 = (char4)(0, 0, -1, -1);
|
|
char4 imask11 = (char4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
char4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
char4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
char4 imask0 = (char4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
} else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_char(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void /*__attribute__((vec_type_hint(uchar4)))*/ bitonicSort_uchar(__global uchar4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
size_t i = get_global_id(0);
|
|
uchar4 srcLeft, srcRight, mask;
|
|
uchar4 imask10 = (uchar4)(0, 0, -1, -1);
|
|
uchar4 imask11 = (uchar4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
uchar4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
uchar4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
uchar4 imask0 = (uchar4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir) {
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
} else {
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uchar(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void /*__attribute__((vec_type_hint(short4)))*/ bitonicSort_short(__global short4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
size_t i = get_global_id(0);
|
|
short4 srcLeft, srcRight, mask;
|
|
short4 imask10 = (short4)(0, 0, -1, -1);
|
|
short4 imask11 = (short4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
short4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
short4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
short4 imask0 = (short4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_short(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void /*__attribute__((vec_type_hint(ushort4)))*/ bitonicSort_ushort(__global ushort4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
size_t i = get_global_id(0);
|
|
ushort4 srcLeft, srcRight, mask;
|
|
ushort4 imask10 = (ushort4)(0, 0, -1, -1);
|
|
ushort4 imask11 = (ushort4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
ushort4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
ushort4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
ushort4 imask0 = (ushort4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir) {
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
} else {
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ushort(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void /*__attribute__((vec_type_hint(int4)))*/ bitonicSort_int(__global int4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
|
|
size_t i = get_global_id(0);
|
|
int4 srcLeft, srcRight, mask;
|
|
int4 imask10 = (int4)(0, 0, -1, -1);
|
|
int4 imask11 = (int4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
int random_id = (i * 5 + stage * 42 + passOfStage * 47 + dir * 88) % 100;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
int4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
int4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
int4 imask0 = (int4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_int(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void /*__attribute__((vec_type_hint(uint4)))*/ bitonicSort_uint(__global uint4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
size_t i = get_global_id(0);
|
|
uint4 srcLeft, srcRight, mask;
|
|
uint4 imask10 = (uint4)(0, 0, -1, -1);
|
|
uint4 imask11 = (uint4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
uint4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
uint4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
uint4 imask0 = (uint4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_uint(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void /*__attribute__((vec_type_hint(long4)))*/ bitonicSort_long(__global long4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
size_t i = get_global_id(0);
|
|
long4 srcLeft, srcRight, mask;
|
|
long4 imask10 = (long4)(0, 0, -1, -1);
|
|
long4 imask11 = (long4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
long4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
long4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
long4 imask0 = (long4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_long(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__kernel void /*__attribute__((vec_type_hint(ulong4)))*/ bitonicSort_ulong(__global ulong4 * theArray,
|
|
__global uint4 * indices,
|
|
const uint stage,
|
|
const uint passOfStage,
|
|
const uint dir)
|
|
{
|
|
size_t i = get_global_id(0);
|
|
ulong4 srcLeft, srcRight, mask;
|
|
ulong4 imask10 = (ulong4)(0, 0, -1, -1);
|
|
ulong4 imask11 = (ulong4)(0, -1, 0, -1);
|
|
uint4 indexLeft, indexRight;
|
|
|
|
if (stage > 0)
|
|
{
|
|
if (passOfStage > 0) // upper level pass, exchange between two fours
|
|
{
|
|
size_t r = 1 << (passOfStage - 1); // length of arrow
|
|
size_t lmask = r - 1;
|
|
size_t left = ((i >> (passOfStage - 1)) << passOfStage) + (i & lmask);
|
|
size_t right = left + r;
|
|
|
|
srcLeft = theArray[left];
|
|
srcRight = theArray[right];
|
|
indexLeft = indices[left];
|
|
indexRight = indices[right];
|
|
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight);
|
|
|
|
ulong4 imin = (srcLeft & mask) | (srcRight & ~mask);
|
|
ulong4 imax = (srcLeft & ~mask) | (srcRight & mask);
|
|
|
|
if (((i >> (stage - 1)) & 1) ^ dir)
|
|
{
|
|
theArray[left] = imin;
|
|
theArray[right] = imax;
|
|
|
|
indices[left] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[right] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
else
|
|
{
|
|
theArray[right] = imin;
|
|
theArray[left] = imax;
|
|
|
|
indices[right] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indices[left] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
else // last pass, sort inside one four
|
|
{
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.zwxy;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.zwxy;
|
|
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if (((i >> stage) & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
else // first stage, sort inside one four
|
|
{
|
|
ulong4 imask0 = (ulong4)(0, -1, -1, 0);
|
|
|
|
srcLeft = theArray[i];
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = indices[i];
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask0;
|
|
|
|
if (dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
|
|
srcRight = srcLeft.zwxy;
|
|
indexRight = indexLeft.zwxy;
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask10;
|
|
|
|
if ((i & 1) ^ dir)
|
|
{
|
|
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
|
|
indices[i] = (indexLeft & convert_uint4(mask)) | (indexRight & convert_uint4(~mask));
|
|
}
|
|
else
|
|
{
|
|
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
|
|
srcRight = srcLeft.yxwz;
|
|
indexLeft = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
indexRight = indexLeft.yxwz;
|
|
|
|
mask = makeMask_ulong(srcLeft, srcRight, indexLeft, indexRight) ^ imask11;
|
|
|
|
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
|
|
indices[i] = (indexLeft & convert_uint4(~mask)) | (indexRight & convert_uint4(mask));
|
|
}
|
|
}
|
|
}
|
|
|
|
)";
|