mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-16 11:22:12 +00:00
222 lines
8.5 KiB
C++
222 lines
8.5 KiB
C++
#pragma once
|
|
|
|
#include <string.h>
|
|
#if !defined(__APPLE__) && !defined(__FreeBSD__)
|
|
#include <malloc.h>
|
|
#endif
|
|
|
|
#ifdef __APPLE__
|
|
#include <OpenCL/opencl.h>
|
|
#else
|
|
#include <CL/cl.h>
|
|
#endif
|
|
|
|
#include <ext/bit_cast.h>
|
|
#include <Core/Types.h>
|
|
#include <Core/Defines.h>
|
|
#include <Common/PODArray.h>
|
|
#include <Columns/ColumnsCommon.h>
|
|
|
|
#include "oclBasics.h"
|
|
#include "bitonicSortKernels.cl"
|
|
|
|
class BitonicSort
|
|
{
|
|
public:
|
|
using KernelType = OCL::KernelType;
|
|
|
|
enum Types
|
|
{
|
|
KernelInt8 = 0,
|
|
KernelUInt8,
|
|
KernelInt16,
|
|
KernelUInt16,
|
|
KernelInt32,
|
|
KernelUInt32,
|
|
KernelInt64,
|
|
KernelUInt64,
|
|
KernelMax
|
|
};
|
|
|
|
static BitonicSort & getInstance()
|
|
{
|
|
static BitonicSort instance = BitonicSort();
|
|
return instance;
|
|
}
|
|
|
|
/// Sorts given array in specified order. Returns `true` if given sequence was sorted, `false` otherwise.
|
|
template <typename T>
|
|
bool sort(const DB::PaddedPODArray<T> & data, DB::IColumn::Permutation & res, cl_uint sort_ascending [[maybe_unused]]) const
|
|
{
|
|
if constexpr (
|
|
std::is_same_v<T, Int8> ||
|
|
std::is_same_v<T, UInt8> ||
|
|
std::is_same_v<T, Int16> ||
|
|
std::is_same_v<T, UInt16> ||
|
|
std::is_same_v<T, Int32> ||
|
|
std::is_same_v<T, UInt32> ||
|
|
std::is_same_v<T, Int64> ||
|
|
std::is_same_v<T, UInt64>)
|
|
{
|
|
size_t data_size = data.size();
|
|
|
|
/// Getting the nearest power of 2.
|
|
size_t power = 8;
|
|
while (power < data_size)
|
|
power <<= 1;
|
|
|
|
/// Allocates more space for additional stubs to be added if needed.
|
|
std::vector<T> pairs_content(power);
|
|
std::vector<UInt32> pairs_indices(power);
|
|
|
|
memcpy(&pairs_content[0], &data[0], sizeof(T) * data_size);
|
|
for (UInt32 i = 0; i < data_size; ++i)
|
|
pairs_indices[i] = i;
|
|
|
|
fillWithStubs(pairs_content.data(), pairs_indices.data(), data_size, power - data_size, sort_ascending);
|
|
sort(pairs_content.data(), pairs_indices.data(), power, sort_ascending);
|
|
|
|
for (size_t i = 0, shift = 0; i < power; ++i)
|
|
{
|
|
if (pairs_indices[i] >= data_size)
|
|
{
|
|
++shift;
|
|
continue;
|
|
}
|
|
res[i - shift] = pairs_indices[i];
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Creating a configuration instance with making all OpenCl required variables
|
|
/// such as device, platform, context, queue, program and kernel.
|
|
void configure()
|
|
{
|
|
OCL::Settings settings = OCL::Settings(1, nullptr, 1, nullptr, 1, 0);
|
|
|
|
cl_platform_id platform = OCL::getPlatformID(settings);
|
|
cl_device_id device = OCL::getDeviceID(platform, settings);
|
|
cl_context gpu_context = OCL::makeContext(device, settings);
|
|
cl_command_queue command_queue = OCL::makeCommandQueue<2>(device, gpu_context, settings);
|
|
|
|
cl_program program = OCL::makeProgram(bitonic_sort_kernels, gpu_context, device, settings);
|
|
|
|
/// Creating kernels for each specified data type.
|
|
cl_int error = 0;
|
|
kernels.resize(KernelMax);
|
|
|
|
kernels[KernelInt8] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_char", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
kernels[KernelUInt8] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uchar", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
kernels[KernelInt16] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_short", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
kernels[KernelUInt16] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ushort", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
kernels[KernelInt32] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_int", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
kernels[KernelUInt32] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uint", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
kernels[KernelInt64] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_long", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
kernels[KernelUInt64] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ulong", &error), clReleaseKernel);
|
|
OCL::checkError(error);
|
|
|
|
configuration = std::shared_ptr<OCL::Configuration>(new OCL::Configuration(device, gpu_context, command_queue, program));
|
|
}
|
|
|
|
private:
|
|
/// Dictionary with kernels for each type from list: uchar, char, ushort, short, uint, int, ulong and long.
|
|
std::vector<std::shared_ptr<KernelType>> kernels;
|
|
/// Current configuration with core OpenCL instances.
|
|
std::shared_ptr<OCL::Configuration> configuration = nullptr;
|
|
|
|
cl_kernel getKernel(Int8) const { return kernels[KernelInt8].get(); }
|
|
cl_kernel getKernel(UInt8) const { return kernels[KernelUInt8].get(); }
|
|
cl_kernel getKernel(Int16) const { return kernels[KernelInt16].get(); }
|
|
cl_kernel getKernel(UInt16) const { return kernels[KernelUInt16].get(); }
|
|
cl_kernel getKernel(Int32) const { return kernels[KernelInt32].get(); }
|
|
cl_kernel getKernel(UInt32) const { return kernels[KernelUInt32].get(); }
|
|
cl_kernel getKernel(Int64) const { return kernels[KernelInt64].get(); }
|
|
cl_kernel getKernel(UInt64) const { return kernels[KernelUInt64].get(); }
|
|
|
|
/// Sorts p_input inplace with indices. Works only with arrays which size equals to power of two.
|
|
template <class T>
|
|
void sort(T * p_input, cl_uint * indices, cl_int array_size, cl_uint sort_ascending) const
|
|
{
|
|
cl_kernel kernel = getKernel(T(0));
|
|
cl_int error = CL_SUCCESS;
|
|
cl_int num_stages = 0;
|
|
|
|
for (cl_int temp = array_size; temp > 2; temp >>= 1)
|
|
num_stages++;
|
|
|
|
/// Creating OpenCL buffers using input arrays memory.
|
|
cl_mem cl_input_buffer = OCL::createBuffer<T>(p_input, array_size, configuration.get()->context());
|
|
cl_mem cl_indices_buffer = OCL::createBuffer<cl_uint>(indices, array_size, configuration.get()->context());
|
|
|
|
configureKernel<cl_mem>(kernel, 0, static_cast<void *>(&cl_input_buffer));
|
|
configureKernel<cl_mem>(kernel, 1, static_cast<void *>(&cl_indices_buffer));
|
|
configureKernel<cl_uint>(kernel, 4, static_cast<void *>(&sort_ascending));
|
|
|
|
for (cl_int stage = 0; stage < num_stages; stage++)
|
|
{
|
|
configureKernel<cl_uint>(kernel, 2, static_cast<void *>(&stage));
|
|
|
|
for (cl_int pass_of_stage = stage; pass_of_stage >= 0; pass_of_stage--)
|
|
{
|
|
configureKernel<cl_uint>(kernel, 3, static_cast<void *>(&pass_of_stage));
|
|
|
|
/// Setting work-item dimensions.
|
|
size_t gsize = array_size / (2 * 4);
|
|
size_t global_work_size[1] = {pass_of_stage ? gsize : gsize << 1 }; // number of quad items in input array
|
|
|
|
/// Executing kernel.
|
|
error = clEnqueueNDRangeKernel(configuration.get()->commandQueue(), kernel, 1, nullptr,
|
|
global_work_size, nullptr, 0, nullptr, nullptr);
|
|
OCL::checkError(error);
|
|
}
|
|
}
|
|
|
|
/// Syncs all threads.
|
|
OCL::finishCommandQueue(configuration.get()->commandQueue());
|
|
|
|
OCL::releaseData(p_input, array_size, cl_input_buffer, configuration.get()->commandQueue());
|
|
OCL::releaseData(indices, array_size, cl_indices_buffer, configuration.get()->commandQueue());
|
|
}
|
|
|
|
template <class T>
|
|
void configureKernel(cl_kernel kernel, int number_of_argument, void * source) const
|
|
{
|
|
cl_int error = clSetKernelArg(kernel, number_of_argument, sizeof(T), source);
|
|
OCL::checkError(error);
|
|
}
|
|
|
|
/// Fills given sequences from `arraySize` index with `numberOfStubs` values.
|
|
template <class T>
|
|
void fillWithStubs(T * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) const
|
|
{
|
|
T value = sort_ascending ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
|
|
for (cl_int index = 0; index < number_of_stubs; ++index)
|
|
{
|
|
p_input[array_size + index] = value;
|
|
indices[array_size + index] = array_size + index;
|
|
}
|
|
}
|
|
|
|
BitonicSort() = default;
|
|
BitonicSort(BitonicSort const &) = delete;
|
|
void operator = (BitonicSort const &) = delete;
|
|
};
|