diff --git a/CMakeLists.txt b/CMakeLists.txt index fb36aff6603..53dfd1df1cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,9 +385,6 @@ if (OS_LINUX AND NOT ENABLE_JEMALLOC) endif () if (USE_OPENCL) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_OPENCL=1") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_OPENCL=1") - if (OS_DARWIN) set(OPENCL_LINKER_FLAGS "-framework OpenCL") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OPENCL_LINKER_FLAGS}") diff --git a/cmake/find/opencl.cmake b/cmake/find/opencl.cmake index b1bf4630990..0f307350cb8 100644 --- a/cmake/find/opencl.cmake +++ b/cmake/find/opencl.cmake @@ -1,13 +1,19 @@ +# TODO: enable by default +if(0) + option(ENABLE_OPENCL "Enable OpenCL support" ${ENABLE_LIBRARIES}) +endif() + if(ENABLE_OPENCL) # Intel OpenCl driver: sudo apt install intel-opencl-icd -# TODO It's possible to add it as submodules: https://github.com/intel/compute-runtime/releases +# @sa https://github.com/intel/compute-runtime/releases # OpenCL applications should link wiht ICD loader # sudo apt install opencl-headers ocl-icd-libopencl1 # sudo ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so +# TODO: add https://github.com/OCL-dev/ocl-icd as submodule instead -find_package(OpenCL REQUIRED) +find_package(OpenCL) if(OpenCL_FOUND) set(USE_OPENCL 1) endif() diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 4958a7976de..bcb239ce083 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -226,7 +226,7 @@ int Server::main(const std::vector & /*args*/) #if !defined(ARCADIA_BUILD) #if USE_OPENCL - BitonicSort::getInstance().configure(); + BitonicSort::getInstance().configure(); #endif #endif diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 42013bdffd1..74f1438de14 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -38,6 +38,7 @@ namespace ErrorCodes { extern const int PARAMETER_OUT_OF_BOUND; extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; + extern const int OPENCL_ERROR; extern const int LOGICAL_ERROR; } @@ -120,6 +121,30 @@ namespace }; } +template +void ColumnVector::getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, + IColumn::SpecialSort special_sort) const +{ + if (special_sort == IColumn::SpecialSort::OPENCL_BITONIC) + { +#if !defined(ARCADIA_BUILD) +#if USE_OPENCL + if (!limit || limit >= data.size()) + { + res.resize(data.size()); + + if (data.empty() || BitonicSort::getInstance().sort(data, res, !reverse)) + return; + } +#else + throw DB::Exception("'special_sort = bitonic' specified but OpenCL not available", DB::ErrorCodes::OPENCL_ERROR); +#endif +#endif + } + + getPermutation(reverse, limit, nan_direction_hint, res); +} + template void ColumnVector::getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const { @@ -144,14 +169,6 @@ void ColumnVector::getPermutation(bool reverse, size_t limit, int nan_directi } else { -#if !defined(ARCADIA_BUILD) -#if USE_OPENCL - /// If bitonic sort if specified as preferred than `nan_direction_hint` equals specific value 42. - if (nan_direction_hint == 42 && BitonicSort::getInstance().sort(data, res, !reverse)) - return; -#endif -#endif - /// A case for radix sort if constexpr (is_arithmetic_v && !std::is_same_v) { diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 2fd177625cc..43b7c607f64 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -189,6 +189,8 @@ public: } void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override; + void getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, + IColumn::SpecialSort) const override; void reserve(size_t n) override { diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 496a0a5759b..2a38fd5365b 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -245,6 +245,17 @@ public: */ virtual void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const = 0; + enum class SpecialSort + { + NONE = 0, + OPENCL_BITONIC, + }; + + virtual void getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, SpecialSort) const + { + getPermutation(reverse, limit, nan_direction_hint, res); + } + /** Copies each element according offsets parameter. * (i-th element should be copied offsets[i] - offsets[i - 1] times.) * It is necessary in ARRAY JOIN operation. diff --git a/src/Common/BitonicSort.h b/src/Common/BitonicSort.h index cbe5b5dc0a4..6bf10ebe835 100644 --- a/src/Common/BitonicSort.h +++ b/src/Common/BitonicSort.h @@ -11,13 +11,6 @@ #include #endif -#include -#include -#include -#include -#include -#include - #include #include #include @@ -30,6 +23,20 @@ class BitonicSort { public: + using KernelType = OCL::KernelType; + + enum Types + { + KernelInt8 = 0, + KernelUInt8, + KernelInt16, + KernelUInt16, + KernelInt32, + KernelUInt32, + KernelInt64, + KernelUInt64, + KernelMax + }; static BitonicSort & getInstance() { @@ -39,40 +46,50 @@ public: /// Sorts given array in specified order. Returns `true` if given sequence was sorted, `false` otherwise. template - bool sort(const DB::PaddedPODArray & data, DB::IColumn::Permutation & res, cl_uint sort_ascending) + bool sort(const DB::PaddedPODArray & data, DB::IColumn::Permutation & res, cl_uint sort_ascending [[maybe_unused]]) const { - size_t s = data.size(); - - /// Getting the nearest power of 2. - size_t power = 1; - - if (s <= 8) power = 8; - else while (power < s) power <<= 1; - - /// Allocates more space for additional stubs to be added if needed. - std::vector pairs_content(power); - std::vector pairs_indices(power); - for (UInt32 i = 0; i < s; ++i) + if constexpr ( + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { - pairs_content[i] = data[i]; - pairs_indices[i] = i; - } + size_t data_size = data.size(); - bool result = sort(pairs_content.data(), pairs_indices.data(), s, power - s, sort_ascending); + /// Getting the nearest power of 2. + size_t power = 8; + while (power < data_size) + power <<= 1; - if (!result) return false; + /// Allocates more space for additional stubs to be added if needed. + std::vector pairs_content(power); + std::vector pairs_indices(power); - for (size_t i = 0, shift = 0; i < power; ++i) - { - if (pairs_indices[i] >= s) + memcpy(&pairs_content[0], &data[0], sizeof(T) * data_size); + for (UInt32 i = 0; i < data_size; ++i) + pairs_indices[i] = i; + + fillWithStubs(pairs_content.data(), pairs_indices.data(), data_size, power - data_size, sort_ascending); + sort(pairs_content.data(), pairs_indices.data(), power, sort_ascending); + + for (size_t i = 0, shift = 0; i < power; ++i) { - ++shift; - continue; + if (pairs_indices[i] >= data_size) + { + ++shift; + continue; + } + res[i - shift] = pairs_indices[i]; } - res[i - shift] = pairs_indices[i]; + + return true; } - return true; + return false; } /// Creating a configuration instance with making all OpenCl required variables @@ -84,29 +101,36 @@ public: cl_platform_id platform = OCL::getPlatformID(settings); cl_device_id device = OCL::getDeviceID(platform, settings); cl_context gpu_context = OCL::makeContext(device, settings); - cl_command_queue command_queue = OCL::makeCommandQueue(device, gpu_context, settings); + cl_command_queue command_queue = OCL::makeCommandQueue<2>(device, gpu_context, settings); cl_program program = OCL::makeProgram(bitonic_sort_kernels, gpu_context, device, settings); /// Creating kernels for each specified data type. cl_int error = 0; + kernels.resize(KernelMax); - kernels["char"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_char", &error), - clReleaseKernel); - kernels["uchar"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_uchar", &error), - clReleaseKernel); - kernels["short"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_short", &error), - clReleaseKernel); - kernels["ushort"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_ushort", &error), - clReleaseKernel); - kernels["int"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_int", &error), - clReleaseKernel); - kernels["uint"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_uint", &error), - clReleaseKernel); - kernels["long"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_long", &error), - clReleaseKernel); - kernels["ulong"] = std::shared_ptr(clCreateKernel(program, "bitonicSort_ulong", &error), - clReleaseKernel); + kernels[KernelInt8] = std::shared_ptr(clCreateKernel(program, "bitonicSort_char", &error), clReleaseKernel); + OCL::checkError(error); + + kernels[KernelUInt8] = std::shared_ptr(clCreateKernel(program, "bitonicSort_uchar", &error), clReleaseKernel); + OCL::checkError(error); + + kernels[KernelInt16] = std::shared_ptr(clCreateKernel(program, "bitonicSort_short", &error), clReleaseKernel); + OCL::checkError(error); + + kernels[KernelUInt16] = std::shared_ptr(clCreateKernel(program, "bitonicSort_ushort", &error), clReleaseKernel); + OCL::checkError(error); + + kernels[KernelInt32] = std::shared_ptr(clCreateKernel(program, "bitonicSort_int", &error), clReleaseKernel); + OCL::checkError(error); + + kernels[KernelUInt32] = std::shared_ptr(clCreateKernel(program, "bitonicSort_uint", &error), clReleaseKernel); + OCL::checkError(error); + + kernels[KernelInt64] = std::shared_ptr(clCreateKernel(program, "bitonicSort_long", &error), clReleaseKernel); + OCL::checkError(error); + + kernels[KernelUInt64] = std::shared_ptr(clCreateKernel(program, "bitonicSort_ulong", &error), clReleaseKernel); OCL::checkError(error); configuration = std::shared_ptr(new OCL::Configuration(device, gpu_context, command_queue, program)); @@ -114,97 +138,24 @@ public: private: /// Dictionary with kernels for each type from list: uchar, char, ushort, short, uint, int, ulong and long. - std::map> kernels; + std::vector> kernels; /// Current configuration with core OpenCL instances. std::shared_ptr configuration = nullptr; - /// Returns `true` if given sequence was sorted, `false` otherwise. - template - bool sort(T * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - if (typeid(T).name() == typeid(cl_char).name()) - sort_char(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else if (typeid(T) == typeid(cl_uchar)) - sort_uchar(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else if (typeid(T) == typeid(cl_short)) - sort_short(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else if (typeid(T) == typeid(cl_ushort)) - sort_ushort(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else if (typeid(T) == typeid(cl_int)) - sort_int(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else if (typeid(T) == typeid(cl_uint)) - sort_uint(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else if (typeid(T) == typeid(cl_long)) - sort_long(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else if (typeid(T) == typeid(cl_ulong)) - sort_ulong(reinterpret_cast(p_input), indices, array_size, number_of_stubs, sort_ascending); - else - return false; - - return true; - } - - /// Specific functions for each integer type. - void sort_char(cl_char * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_char stubs_value = sort_ascending ? CHAR_MAX : CHAR_MIN; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["char"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } - - void sort_uchar(cl_uchar * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_uchar stubs_value = sort_ascending ? UCHAR_MAX : 0; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["uchar"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } - - void sort_short(cl_short * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_short stubs_value = sort_ascending ? SHRT_MAX : SHRT_MIN; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["short"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } - - void sort_ushort(cl_ushort * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_ushort stubs_value = sort_ascending ? USHRT_MAX : 0; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["ushort"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } - - void sort_int(cl_int * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_int stubs_value = sort_ascending ? INT_MAX : INT_MIN; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["int"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } - - void sort_uint(cl_uint * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_uint stubs_value = sort_ascending ? UINT_MAX : 0; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["uint"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } - - void sort_long(cl_long * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_long stubs_value = sort_ascending ? LONG_MAX : LONG_MIN; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["long"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } - - void sort_ulong(cl_ulong * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) - { - cl_ulong stubs_value = sort_ascending ? ULONG_MAX : 0; - fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size); - sort(kernels["ulong"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending); - } + cl_kernel getKernel(Int8) const { return kernels[KernelInt8].get(); } + cl_kernel getKernel(UInt8) const { return kernels[KernelUInt8].get(); } + cl_kernel getKernel(Int16) const { return kernels[KernelInt16].get(); } + cl_kernel getKernel(UInt16) const { return kernels[KernelUInt16].get(); } + cl_kernel getKernel(Int32) const { return kernels[KernelInt32].get(); } + cl_kernel getKernel(UInt32) const { return kernels[KernelUInt32].get(); } + cl_kernel getKernel(Int64) const { return kernels[KernelInt64].get(); } + cl_kernel getKernel(UInt64) const { return kernels[KernelUInt64].get(); } /// Sorts p_input inplace with indices. Works only with arrays which size equals to power of two. template - void sort(cl_kernel kernel, T * p_input, cl_uint * indices, cl_int array_size, cl_uint sort_ascending) + void sort(T * p_input, cl_uint * indices, cl_int array_size, cl_uint sort_ascending) const { + cl_kernel kernel = getKernel(T(0)); cl_int error = CL_SUCCESS; cl_int num_stages = 0; @@ -246,7 +197,7 @@ private: } template - void configureKernel(cl_kernel kernel, int number_of_argument, void * source) + void configureKernel(cl_kernel kernel, int number_of_argument, void * source) const { cl_int error = clSetKernelArg(kernel, number_of_argument, sizeof(T), source); OCL::checkError(error); @@ -254,9 +205,9 @@ private: /// Fills given sequences from `arraySize` index with `numberOfStubs` values. template - void fillWithStubs(cl_int number_of_stubs, T value, T * p_input, - cl_uint * indices, cl_int array_size) + void fillWithStubs(T * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) const { + T value = sort_ascending ? std::numeric_limits::max() : std::numeric_limits::min(); for (cl_int index = 0; index < number_of_stubs; ++index) { p_input[array_size + index] = value; @@ -264,7 +215,7 @@ private: } } - BitonicSort() {} - BitonicSort(BitonicSort const &); - void operator=(BitonicSort const &); + BitonicSort() = default; + BitonicSort(BitonicSort const &) = delete; + void operator = (BitonicSort const &) = delete; }; diff --git a/src/Common/oclBasics.h b/src/Common/oclBasics.h index 550f42a32d0..7c977830e82 100644 --- a/src/Common/oclBasics.h +++ b/src/Common/oclBasics.h @@ -17,24 +17,18 @@ #include #include -#ifndef CL_VERSION_2_0 -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif - - -using KernelType = std::remove_reference::type; - namespace DB { - namespace ErrorCodes - { - extern const int OPENCL_ERROR; - } +namespace ErrorCodes +{ + extern const int OPENCL_ERROR; +} } struct OCL { + using KernelType = std::remove_reference::type; /** * Structure which represents the most essential settings of common OpenCl entities. @@ -211,7 +205,7 @@ struct OCL static void checkError(cl_int error) { if (error != CL_SUCCESS) - throw DB::Exception("OpenCL error " + opencl_error_to_str(error), DB::ErrorCodes::OPENCL_ERROR); + throw DB::Exception("OpenCL error: " + opencl_error_to_str(error), DB::ErrorCodes::OPENCL_ERROR); } @@ -223,22 +217,18 @@ struct OCL cl_int error = clGetPlatformIDs(settings.number_of_platform_entries, &platform, settings.number_of_available_platforms); checkError(error); - return platform; } - static cl_device_id getDeviceID(cl_platform_id & platform, const Settings & settings) { cl_device_id device; cl_int error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, settings.number_of_devices_entries, &device, settings.number_of_available_devices); OCL::checkError(error); - return device; } - static cl_context makeContext(cl_device_id & device, const Settings & settings) { cl_int error; @@ -246,32 +236,43 @@ struct OCL &device, settings.context_callback, settings.context_callback_data, &error); OCL::checkError(error); - return gpu_context; } - + template static cl_command_queue makeCommandQueue(cl_device_id & device, cl_context & context, const Settings & settings [[maybe_unused]]) { cl_int error; -#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS - cl_command_queue command_queue = clCreateCommandQueue(context, device, settings.command_queue_properties, &error); -#else - cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device, nullptr, &error); -#endif - OCL::checkError(error); + cl_command_queue command_queue; + if constexpr (version == 1) + { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + command_queue = clCreateCommandQueue(context, device, settings.command_queue_properties, &error); +#pragma GCC diagnostic pop + } + else + { +#ifdef CL_VERSION_2_0 + command_queue = clCreateCommandQueueWithProperties(context, device, nullptr, &error); +#else + throw DB::Exception("Binary is built with OpenCL version < 2.0", DB::ErrorCodes::OPENCL_ERROR); +#endif + } + + OCL::checkError(error); return command_queue; } - static cl_program makeProgram(const char * source_code, cl_context context, cl_device_id device_id, const Settings & settings) { cl_int error = 0; size_t source_size = strlen(source_code); - cl_program program = clCreateProgramWithSource(context, settings.number_of_program_source_pointers, &source_code, &source_size, &error); + cl_program program = clCreateProgramWithSource(context, settings.number_of_program_source_pointers, + &source_code, &source_size, &error); checkError(error); error = clBuildProgram(program, settings.number_of_devices_entries, &device_id, settings.build_options, @@ -293,39 +294,30 @@ struct OCL } checkError(error); - return program; } - /// Configuring buffer for given input data template - static cl_mem createBuffer(K * p_input, cl_int array_size, cl_context context, - cl_int elements_size = sizeof(K)) + static cl_mem createBuffer(K * p_input, cl_int array_size, cl_context context, cl_int elements_size = sizeof(K)) { cl_int error = CL_SUCCESS; - cl_mem cl_input_buffer = - clCreateBuffer - ( + cl_mem cl_input_buffer = clCreateBuffer( context, CL_MEM_USE_HOST_PTR, zeroCopySizeAlignment(elements_size * array_size), p_input, - &error - ); + &error); checkError(error); - return cl_input_buffer; } - static size_t zeroCopySizeAlignment(size_t required_size) { return required_size + (~required_size + 1) % 64; } - /// Manipulating with common OpenCL variables. static void finishCommandQueue(cl_command_queue command_queue) @@ -335,10 +327,8 @@ struct OCL OCL::checkError(error); } - template - static void releaseData(T * origin, cl_int array_size, cl_mem cl_buffer, - cl_command_queue command_queue, size_t offset = 0) + static void releaseData(T * origin, cl_int array_size, cl_mem cl_buffer, cl_command_queue command_queue, size_t offset = 0) { cl_int error = CL_SUCCESS; @@ -359,7 +349,6 @@ struct OCL error = clReleaseMemObject(cl_buffer); checkError(error); } - }; #endif diff --git a/src/Common/tests/CMakeLists.txt b/src/Common/tests/CMakeLists.txt index 1c8a39a4fa5..72c47d1ef49 100644 --- a/src/Common/tests/CMakeLists.txt +++ b/src/Common/tests/CMakeLists.txt @@ -35,10 +35,10 @@ target_link_libraries (compact_array PRIVATE clickhouse_common_io) add_executable (radix_sort radix_sort.cpp) target_link_libraries (radix_sort PRIVATE clickhouse_common_io) -# if (USE_OPENCL) -# add_executable (bitonic_sort bitonic_sort.cpp) -# target_link_libraries (bitonic_sort PRIVATE clickhouse_common_io ${OPENCL_LINKER_FLAGS}) -# endif () +if (USE_OPENCL) + add_executable (bitonic_sort bitonic_sort.cpp) + target_link_libraries (bitonic_sort PRIVATE clickhouse_common_io ${OPENCL_LINKER_FLAGS} ${OpenCL_LIBRARIES}) +endif () add_executable (arena_with_free_lists arena_with_free_lists.cpp) target_link_libraries (arena_with_free_lists PRIVATE dbms) diff --git a/src/Common/tests/bitonic_sort.cpp b/src/Common/tests/bitonic_sort.cpp index adaef94ed4c..2545662c8cb 100644 --- a/src/Common/tests/bitonic_sort.cpp +++ b/src/Common/tests/bitonic_sort.cpp @@ -1,8 +1,6 @@ #include #include -#if USE_OPENCL - #if !defined(__APPLE__) && !defined(__FreeBSD__) #include #endif @@ -16,13 +14,10 @@ #include "Common/BitonicSort.h" -using Key = cl_ulong; - - /// Generates vector of size 8 for testing. /// Vector contains max possible value, min possible value and duplicate values. template -static void generateTest(std::vector& data, Type min_value, Type max_value) +static void generateTest(std::vector & data, Type min_value, Type max_value) { int size = 10; @@ -62,8 +57,7 @@ static void check(const std::vector & indices, bool reverse = true) template -static void sortBitonicSortWithPodArrays(const std::vector& data, - std::vector & indices, bool ascending = true) +static void sortBitonicSortWithPodArrays(const std::vector & data, std::vector & indices, bool ascending = true) { DB::PaddedPODArray pod_array_data = DB::PaddedPODArray(data.size()); DB::IColumn::Permutation pod_array_indices = DB::IColumn::Permutation(data.size()); @@ -74,7 +68,6 @@ static void sortBitonicSortWithPodArrays(const std::vector& data, *(pod_array_indices.data() + index) = index; } - BitonicSort::getInstance().configure(); BitonicSort::getInstance().sort(pod_array_data, pod_array_indices, ascending); for (size_t index = 0; index < data.size(); ++index) @@ -83,7 +76,7 @@ static void sortBitonicSortWithPodArrays(const std::vector& data, template -static void testBitonicSort(std::string test_name, Type min_value, Type max_value) +static void testBitonicSort(const std::string & test_name, Type min_value, Type max_value) { std::cerr << test_name << std::endl; @@ -102,147 +95,80 @@ static void testBitonicSort(std::string test_name, Type min_value, Type max_valu static void straightforwardTests() { - testBitonicSort("Test 01: cl_char.", CHAR_MIN, CHAR_MAX); - testBitonicSort("Test 02: cl_uchar.", 0, UCHAR_MAX); - testBitonicSort("Test 03: cl_short.", SHRT_MIN, SHRT_MAX); - testBitonicSort("Test 04: cl_ushort.", 0, USHRT_MAX); - testBitonicSort("Test 05: cl_int.", INT_MIN, INT_MAX); - testBitonicSort("Test 06: cl_uint.", 0, UINT_MAX); - testBitonicSort("Test 07: cl_long.", LONG_MIN, LONG_MAX); - testBitonicSort("Test 08: cl_ulong.", 0, ULONG_MAX); + testBitonicSort("Test 01: Int8.", CHAR_MIN, CHAR_MAX); + testBitonicSort("Test 02: UInt8.", 0, UCHAR_MAX); + testBitonicSort("Test 03: Int16.", SHRT_MIN, SHRT_MAX); + testBitonicSort("Test 04: UInt16.", 0, USHRT_MAX); + testBitonicSort("Test 05: Int32.", INT_MIN, INT_MAX); + testBitonicSort("Test 06: UInt32.", 0, UINT_MAX); + testBitonicSort("Test 07: Int64.", LONG_MIN, LONG_MAX); + testBitonicSort("Test 08: UInt64.", 0, ULONG_MAX); } -static void NO_INLINE sort1(Key * data, size_t size) +template +static void bitonicSort(std::vector & data) { - std::sort(data, data + size); -} - - -static void NO_INLINE sort2(std::vector & data, std::vector & indices) -{ - BitonicSort::getInstance().configure(); + size_t size = data.size(); + std::vector indices(size); + for (size_t i = 0; i < size; ++i) + indices[i] = i; sortBitonicSortWithPodArrays(data, indices); - std::vector result(data.size()); - for (size_t index = 0; index < data.size(); ++index) - result[index] = data[indices[index]]; + std::vector result(size); + for (size_t i = 0; i < size; ++i) + result[i] = data[indices[i]]; data = std::move(result); } -int main(int argc, char ** argv) +template +static bool checkSort(const std::vector & data, size_t size) { - straightforwardTests(); + std::vector copy1(data.begin(), data.begin() + size); + std::vector copy2(data.begin(), data.begin() + size); - if (argc < 3) - { - std::cerr << "Not enough arguments were passed\n"; - return 1; - } + std::sort(copy1.data(), copy1.data() + size); + bitonicSort(copy2); - size_t n = DB::parse(argv[1]); - size_t method = DB::parse(argv[2]); + for (size_t i = 0; i < size; ++i) + if (copy1[i] != copy2[i]) + return false; - std::vector data(n); - std::vector indices(n); - - { - Stopwatch watch; - - for (auto & elem : data) - elem = static_cast(rand()); - - for (size_t i = 0; i < n; ++i) - indices[i] = i; - - watch.stop(); - double elapsed = watch.elapsedSeconds(); - std::cerr - << "Filled in " << elapsed - << " (" << n / elapsed << " elem/sec., " - << n * sizeof(Key) / elapsed / 1048576 << " MB/sec.)" - << std::endl; - } - - if (n <= 100) - { - std::cerr << std::endl; - for (const auto & elem : data) - std::cerr << elem << ' '; - std::cerr << std::endl; - for (const auto & index : indices) - std::cerr << index << ' '; - std::cerr << std::endl; - } - - { - Stopwatch watch; - - if (method == 1) sort1(data.data(), n); - if (method == 2) sort2(data, indices); - - watch.stop(); - double elapsed = watch.elapsedSeconds(); - std::cerr - << "Sorted in " << elapsed - << " (" << n / elapsed << " elem/sec., " - << n * sizeof(Key) / elapsed / 1048576 << " MB/sec.)" - << std::endl; - } - - { - Stopwatch watch; - - size_t i = 1; - while (i < n) - { - if (!(data[i - 1] <= data[i])) - break; - ++i; - } - - watch.stop(); - double elapsed = watch.elapsedSeconds(); - std::cerr - << "Checked in " << elapsed - << " (" << n / elapsed << " elem/sec., " - << n * sizeof(Key) / elapsed / 1048576 << " MB/sec.)" - << std::endl - << "Result: " << (i == n ? "Ok." : "Fail!") << std::endl; - } - - if (n <= 1000) - { - std::cerr << std::endl; - - std::cerr << data[0] << ' '; - for (size_t i = 1; i < n; ++i) - { - if (!(data[i - 1] <= data[i])) - std::cerr << "*** "; - std::cerr << data[i] << ' '; - } - - std::cerr << std::endl; - - for (const auto & index : indices) - std::cerr << index << ' '; - std::cerr << std::endl; - } - - return 0; + return true; } -#else int main() { - std::cerr << "Openc CL disabled."; + BitonicSort::getInstance().configure(); + + straightforwardTests(); + + size_t size = 1100; + std::vector data(size); + for (size_t i = 0; i < size; ++i) + data[i] = rand(); + + for (size_t i = 0; i < 128; ++i) + { + if (!checkSort(data, i)) + { + std::cerr << "fail at length " << i << std::endl; + return 1; + } + } + + for (size_t i = 128; i < size; i += 7) + { + if (!checkSort(data, i)) + { + std::cerr << "fail at length " << i << std::endl; + return 1; + } + } return 0; } - -#endif diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 054ee9a80b7..620c23c21cc 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -8,3 +8,4 @@ #cmakedefine01 USE_EMBEDDED_COMPILER #cmakedefine01 USE_INTERNAL_LLVM_LIBRARY #cmakedefine01 USE_SSL +#cmakedefine01 USE_OPENCL diff --git a/src/Interpreters/sortBlock.cpp b/src/Interpreters/sortBlock.cpp index 0e98dc0eb4b..ec0865c2fb5 100644 --- a/src/Interpreters/sortBlock.cpp +++ b/src/Interpreters/sortBlock.cpp @@ -13,7 +13,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_COLLATION; - extern const int OPENCL_ERROR; } static bool isCollationRequired(const SortColumnDescription & description) @@ -134,20 +133,12 @@ void sortBlock(Block & block, const SortDescription & description, UInt64 limit) else if (!isColumnConst(*column)) { int nan_direction_hint = description[0].nulls_direction; + auto special_sort = description[0].special_sort; - /// If in Settings `special_sort` option has been set as `bitonic_sort`, - /// then via `nan_direction_hint` variable a flag which specifies bitonic sort as preferred - /// will be passed to `getPermutation` method with value 42. - if (description[0].special_sort == SpecialSort::OPENCL_BITONIC) - { -#ifdef USE_OPENCL - nan_direction_hint = 42; -#else - throw DB::Exception("Bitonic sort specified as preferred, but OpenCL not available", DB::ErrorCodes::OPENCL_ERROR); -#endif - } - - column->getPermutation(reverse, limit, nan_direction_hint, perm); + if (special_sort == SpecialSort::OPENCL_BITONIC) + column->getSpecialPermutation(reverse, limit, nan_direction_hint, perm, IColumn::SpecialSort::OPENCL_BITONIC); + else + column->getPermutation(reverse, limit, nan_direction_hint, perm); } else /// we don't need to do anything with const column