diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb36aff6603..53dfd1df1cb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -385,9 +385,6 @@ if (OS_LINUX AND NOT ENABLE_JEMALLOC)
 endif ()
 
 if (USE_OPENCL)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_OPENCL=1")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_OPENCL=1")
-
     if (OS_DARWIN)
         set(OPENCL_LINKER_FLAGS "-framework OpenCL")
         set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OPENCL_LINKER_FLAGS}")
diff --git a/cmake/find/opencl.cmake b/cmake/find/opencl.cmake
index b1bf4630990..0f307350cb8 100644
--- a/cmake/find/opencl.cmake
+++ b/cmake/find/opencl.cmake
@@ -1,13 +1,19 @@
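+# TODO: enable by default. Until then the option is not declared, so OpenCL is used only when ENABLE_OPENCL is set explicitly (e.g. -DENABLE_OPENCL=1).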
+if(0)
+    option(ENABLE_OPENCL "Enable OpenCL support" ${ENABLE_LIBRARIES})
+endif()
+
 if(ENABLE_OPENCL)
 
 # Intel OpenCl driver: sudo apt install intel-opencl-icd
-# TODO It's possible to add it as submodules: https://github.com/intel/compute-runtime/releases
+# @sa https://github.com/intel/compute-runtime/releases
 
-# OpenCL applications should link wiht ICD loader
+# OpenCL applications should link with the ICD loader
 # sudo apt install opencl-headers ocl-icd-libopencl1
 # sudo ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so
+# TODO: add https://github.com/OCL-dev/ocl-icd as submodule instead
 
-find_package(OpenCL REQUIRED)
+find_package(OpenCL)
 if(OpenCL_FOUND)
     set(USE_OPENCL 1)
 endif()
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index 4958a7976de..bcb239ce083 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -226,7 +226,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
 
 #if !defined(ARCADIA_BUILD)
 #if USE_OPENCL
-        BitonicSort::getInstance().configure();
+    BitonicSort::getInstance().configure();
 #endif
 #endif
 
diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp
index 42013bdffd1..74f1438de14 100644
--- a/src/Columns/ColumnVector.cpp
+++ b/src/Columns/ColumnVector.cpp
@@ -38,6 +38,7 @@ namespace ErrorCodes
 {
     extern const int PARAMETER_OUT_OF_BOUND;
     extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
+    extern const int OPENCL_ERROR;
     extern const int LOGICAL_ERROR;
 }
 
@@ -120,6 +121,30 @@ namespace
     };
 }
 
+template <typename T>
+void ColumnVector<T>::getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res,
+                                            IColumn::SpecialSort special_sort) const
+{
+    if (special_sort == IColumn::SpecialSort::OPENCL_BITONIC) /// Try the requested OpenCL bitonic sort before the regular one.
+    {
+#if !defined(ARCADIA_BUILD)
+#if USE_OPENCL
+        if (!limit || limit >= data.size()) /// Only a full sort is offloaded: no limit, or a limit that covers every row.
+        {
+            res.resize(data.size());
+
+            if (data.empty() || BitonicSort::getInstance().sort(data, res, !reverse)) /// `sort` returns false when it did not sort the data.
+                return;
+        }
+#else
+        throw DB::Exception("'special_sort = bitonic' specified but OpenCL not available", DB::ErrorCodes::OPENCL_ERROR);
+#endif
+#endif
+    }
+
+    getPermutation(reverse, limit, nan_direction_hint, res); /// Fallback: the regular permutation.
+}
+
 template <typename T>
 void ColumnVector<T>::getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const
 {
@@ -144,14 +169,6 @@ void ColumnVector<T>::getPermutation(bool reverse, size_t limit, int nan_directi
     }
     else
     {
-#if !defined(ARCADIA_BUILD)
-#if USE_OPENCL
-        /// If bitonic sort if specified as preferred than `nan_direction_hint` equals specific value 42.
-        if (nan_direction_hint == 42 && BitonicSort::getInstance().sort(data, res, !reverse))
-            return;
-#endif
-#endif
-
         /// A case for radix sort
         if constexpr (is_arithmetic_v<T> && !std::is_same_v<T, UInt128>)
         {
diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h
index 2fd177625cc..43b7c607f64 100644
--- a/src/Columns/ColumnVector.h
+++ b/src/Columns/ColumnVector.h
@@ -189,6 +189,8 @@ public:
     }
 
     void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
+    void getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res,
+                               IColumn::SpecialSort) const override;
 
     void reserve(size_t n) override
     {
diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h
index 496a0a5759b..2a38fd5365b 100644
--- a/src/Columns/IColumn.h
+++ b/src/Columns/IColumn.h
@@ -245,6 +245,17 @@ public:
       */
     virtual void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const = 0;
 
+    enum class SpecialSort /// Special sorting method that can be requested through getSpecialPermutation().
+    {
+        NONE = 0,
+        OPENCL_BITONIC,
+    };
+
+    virtual void getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, SpecialSort) const
+    {
+        getPermutation(reverse, limit, nan_direction_hint, res); /// Default implementation ignores the SpecialSort argument and produces the regular permutation.
+    }
+
     /** Copies each element according offsets parameter.
       * (i-th element should be copied offsets[i] - offsets[i - 1] times.)
       * It is necessary in ARRAY JOIN operation.
diff --git a/src/Common/BitonicSort.h b/src/Common/BitonicSort.h
index cbe5b5dc0a4..6bf10ebe835 100644
--- a/src/Common/BitonicSort.h
+++ b/src/Common/BitonicSort.h
@@ -11,13 +11,6 @@
 #include <CL/cl.h>
 #endif
 
-#include <algorithm>
-#include <cmath>
-#include <cstdlib>
-#include <cstdint>
-#include <map>
-#include <type_traits>
-
 #include <ext/bit_cast.h>
 #include <Core/Types.h>
 #include <Core/Defines.h>
@@ -30,6 +23,20 @@
 class BitonicSort
 {
 public:
+    using KernelType = OCL::KernelType;
+
+    enum Types /// Index of each per-type kernel in `kernels`.
+    {
+        KernelInt8 = 0,
+        KernelUInt8,
+        KernelInt16,
+        KernelUInt16,
+        KernelInt32,
+        KernelUInt32,
+        KernelInt64,
+        KernelUInt64,
+        KernelMax /// Number of kernels; `kernels` is resized to this value.
+    };
 
     static BitonicSort & getInstance()
     {
@@ -39,40 +46,56 @@ public:
 
     /// Sorts given array in specified order. Returns `true` if given sequence was sorted, `false` otherwise.
     template <typename T>
-    bool sort(const DB::PaddedPODArray<T> & data, DB::IColumn::Permutation & res, cl_uint sort_ascending)
+    bool sort(const DB::PaddedPODArray<T> & data, DB::IColumn::Permutation & res, cl_uint sort_ascending [[maybe_unused]]) const
     {
-        size_t s = data.size();
-
-        /// Getting the nearest power of 2.
-        size_t power = 1;
-
-        if (s <= 8) power = 8;
-        else while (power < s) power <<= 1;
-
-        /// Allocates more space for additional stubs to be added if needed.
-        std::vector<T> pairs_content(power);
-        std::vector<UInt32> pairs_indices(power);
-        for (UInt32 i = 0; i < s; ++i)
+        /// Only the integer types listed here are handled; for any other T nothing is done and `false` is returned.
+        if constexpr (
+            std::is_same_v<T, Int8> ||
+            std::is_same_v<T, UInt8> ||
+            std::is_same_v<T, Int16> ||
+            std::is_same_v<T, UInt16> ||
+            std::is_same_v<T, Int32> ||
+            std::is_same_v<T, UInt32> ||
+            std::is_same_v<T, Int64> ||
+            std::is_same_v<T, UInt64>)
         {
-            pairs_content[i] = data[i];
-            pairs_indices[i] = i;
-        }
+            size_t data_size = data.size();
 
-        bool result = sort(pairs_content.data(), pairs_indices.data(), s, power - s, sort_ascending);
+            /// Getting the nearest power of 2.
+            size_t power = 8;
+            while (power < data_size)
+                power <<= 1;
+
+            /// The padded size is passed on as `cl_int` and indices are 32-bit: report oversized input as "not sorted" instead of overflowing.
+            if (power > static_cast<size_t>(std::numeric_limits<cl_int>::max()))
+                return false;
 
-        if (!result) return false;
+            /// Allocates more space for additional stubs to be added if needed.
+            std::vector<T> pairs_content(power);
+            std::vector<UInt32> pairs_indices(power);
 
-        for (size_t i = 0, shift = 0; i < power; ++i)
-        {
-            if (pairs_indices[i] >= s)
+            memcpy(pairs_content.data(), data.data(), sizeof(T) * data_size);
+            for (UInt32 i = 0; i < data_size; ++i)
+                pairs_indices[i] = i;
+
+            fillWithStubs(pairs_content.data(), pairs_indices.data(), data_size, power - data_size, sort_ascending);
+            sort(pairs_content.data(), pairs_indices.data(), power, sort_ascending);
+
+            /// Drop the padding entries (indices >= data_size) and write the remaining indices to `res` in sorted order.
+            for (size_t i = 0, shift = 0; i < power; ++i)
             {
-                ++shift;
-                continue;
+                if (pairs_indices[i] >= data_size)
+                {
+                    ++shift;
+                    continue;
+                }
+                res[i - shift] = pairs_indices[i];
             }
-            res[i - shift] = pairs_indices[i];
+
+            return true;
         }
 
-        return true;
+        return false;
     }
 
     /// Creating a configuration instance with making all OpenCl required variables
@@ -84,29 +101,36 @@ public:
         cl_platform_id platform = OCL::getPlatformID(settings);
         cl_device_id device = OCL::getDeviceID(platform, settings);
         cl_context gpu_context = OCL::makeContext(device, settings);
-        cl_command_queue command_queue = OCL::makeCommandQueue(device, gpu_context, settings);
+        cl_command_queue command_queue = OCL::makeCommandQueue<2>(device, gpu_context, settings);
 
         cl_program program = OCL::makeProgram(bitonic_sort_kernels, gpu_context, device, settings);
 
         /// Creating kernels for each specified data type.
         cl_int error = 0;
+        kernels.resize(KernelMax);
 
-        kernels["char"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_char", &error),
-                                                      clReleaseKernel);
-        kernels["uchar"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uchar", &error),
-                                                       clReleaseKernel);
-        kernels["short"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_short", &error),
-                                                       clReleaseKernel);
-        kernels["ushort"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ushort", &error),
-                                                        clReleaseKernel);
-        kernels["int"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_int", &error),
-                                                     clReleaseKernel);
-        kernels["uint"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uint", &error),
-                                                      clReleaseKernel);
-        kernels["long"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_long", &error),
-                                                      clReleaseKernel);
-        kernels["ulong"] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ulong", &error),
-                                                       clReleaseKernel);
+        kernels[KernelInt8] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_char", &error), clReleaseKernel);
+        OCL::checkError(error);
+
+        kernels[KernelUInt8] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uchar", &error), clReleaseKernel);
+        OCL::checkError(error);
+
+        kernels[KernelInt16] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_short", &error), clReleaseKernel);
+        OCL::checkError(error);
+
+        kernels[KernelUInt16] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ushort", &error), clReleaseKernel);
+        OCL::checkError(error);
+
+        kernels[KernelInt32] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_int", &error), clReleaseKernel);
+        OCL::checkError(error);
+
+        kernels[KernelUInt32] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uint", &error), clReleaseKernel);
+        OCL::checkError(error);
+
+        kernels[KernelInt64] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_long", &error), clReleaseKernel);
+        OCL::checkError(error);
+
+        kernels[KernelUInt64] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ulong", &error), clReleaseKernel);
         OCL::checkError(error);
 
         configuration = std::shared_ptr<OCL::Configuration>(new OCL::Configuration(device, gpu_context, command_queue, program));
@@ -114,97 +138,24 @@ public:
 
 private:
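-    /// Dictionary with kernels for each type from list: uchar, char, ushort, short, uint, int, ulong and long.
+    /// Kernels for each type from list: uchar, char, ushort, short, uint, int, ulong and long; indexed by the `Types` enum.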
-    std::map<std::string, std::shared_ptr<KernelType>> kernels;
+    std::vector<std::shared_ptr<KernelType>> kernels;
     /// Current configuration with core OpenCL instances.
     std::shared_ptr<OCL::Configuration> configuration = nullptr;
 
-    /// Returns `true` if given sequence was sorted, `false` otherwise.
-    template <typename T>
-    bool sort(T * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        if (typeid(T).name() == typeid(cl_char).name())
-            sort_char(reinterpret_cast<cl_char *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else if (typeid(T) == typeid(cl_uchar))
-            sort_uchar(reinterpret_cast<cl_uchar *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else if (typeid(T) == typeid(cl_short))
-            sort_short(reinterpret_cast<cl_short *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else if (typeid(T) == typeid(cl_ushort))
-            sort_ushort(reinterpret_cast<cl_ushort *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else if (typeid(T) == typeid(cl_int))
-            sort_int(reinterpret_cast<cl_int *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else if (typeid(T) == typeid(cl_uint))
-            sort_uint(reinterpret_cast<cl_uint *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else if (typeid(T) == typeid(cl_long))
-            sort_long(reinterpret_cast<cl_long *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else if (typeid(T) == typeid(cl_ulong))
-            sort_ulong(reinterpret_cast<cl_ulong *>(p_input), indices, array_size, number_of_stubs, sort_ascending);
-        else
-            return false;
-
-        return true;
-    }
-
-    /// Specific functions for each integer type.
-    void sort_char(cl_char * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_char stubs_value = sort_ascending ? CHAR_MAX : CHAR_MIN;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["char"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
-
-    void sort_uchar(cl_uchar * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_uchar stubs_value = sort_ascending ? UCHAR_MAX : 0;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["uchar"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
-
-    void sort_short(cl_short * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_short stubs_value = sort_ascending ? SHRT_MAX : SHRT_MIN;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["short"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
-
-    void sort_ushort(cl_ushort * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_ushort stubs_value = sort_ascending ? USHRT_MAX : 0;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["ushort"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
-
-    void sort_int(cl_int * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_int stubs_value = sort_ascending ? INT_MAX : INT_MIN;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["int"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
-
-    void sort_uint(cl_uint * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_uint stubs_value = sort_ascending ? UINT_MAX : 0;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["uint"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
-
-    void sort_long(cl_long * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_long stubs_value = sort_ascending ? LONG_MAX : LONG_MIN;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["long"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
-
-    void sort_ulong(cl_ulong * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending)
-    {
-        cl_ulong stubs_value = sort_ascending ? ULONG_MAX : 0;
-        fillWithStubs(number_of_stubs, stubs_value, p_input, indices, array_size);
-        sort(kernels["ulong"].get(), p_input, indices, array_size + number_of_stubs, sort_ascending);
-    }
+    cl_kernel getKernel(Int8) const { return kernels[KernelInt8].get(); } /// Overloads pick the kernel by argument type only; the value is unused (called as getKernel(T(0))).
+    cl_kernel getKernel(UInt8) const { return kernels[KernelUInt8].get(); }
+    cl_kernel getKernel(Int16) const { return kernels[KernelInt16].get(); }
+    cl_kernel getKernel(UInt16) const { return kernels[KernelUInt16].get(); }
+    cl_kernel getKernel(Int32) const { return kernels[KernelInt32].get(); }
+    cl_kernel getKernel(UInt32) const { return kernels[KernelUInt32].get(); }
+    cl_kernel getKernel(Int64) const { return kernels[KernelInt64].get(); }
+    cl_kernel getKernel(UInt64) const { return kernels[KernelUInt64].get(); }
 
     /// Sorts p_input inplace with indices. Works only with arrays which size equals to power of two.
     template <class T>
-    void sort(cl_kernel kernel, T * p_input, cl_uint * indices, cl_int array_size, cl_uint sort_ascending)
+    void sort(T * p_input, cl_uint * indices, cl_int array_size, cl_uint sort_ascending) const
     {
+        cl_kernel kernel = getKernel(T(0));
         cl_int error = CL_SUCCESS;
         cl_int num_stages = 0;
 
@@ -246,7 +197,7 @@ private:
     }
 
     template <class T>
-    void configureKernel(cl_kernel kernel, int number_of_argument, void * source)
+    void configureKernel(cl_kernel kernel, int number_of_argument, void * source) const
     {
         cl_int error = clSetKernelArg(kernel, number_of_argument, sizeof(T), source);
         OCL::checkError(error);
@@ -254,9 +205,9 @@ private:
 
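-    /// Fills given sequences from `arraySize` index with `numberOfStubs` values.
+    /// Pads the sequences with `number_of_stubs` stub elements starting at index `array_size`; the stub value is the maximum of T for an ascending sort and the minimum otherwise.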
     template <class T>
-    void fillWithStubs(cl_int number_of_stubs, T value, T * p_input,
-                       cl_uint * indices, cl_int array_size)
+    void fillWithStubs(T * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) const
     {
+        T value = sort_ascending ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
         for (cl_int index = 0; index < number_of_stubs; ++index)
         {
             p_input[array_size + index] = value;
@@ -264,7 +215,7 @@ private:
         }
     }
 
-    BitonicSort() {}
-    BitonicSort(BitonicSort const &);
-    void operator=(BitonicSort const &);
+    BitonicSort() = default;
+    BitonicSort(BitonicSort const &) = delete;
+    void operator = (BitonicSort const &) = delete;
 };
diff --git a/src/Common/oclBasics.h b/src/Common/oclBasics.h
index 550f42a32d0..7c977830e82 100644
--- a/src/Common/oclBasics.h
+++ b/src/Common/oclBasics.h
@@ -17,24 +17,18 @@
 #include <Core/Types.h>
 #include <Common/Exception.h>
 
-#ifndef CL_VERSION_2_0
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#endif
-
-
-using KernelType = std::remove_reference<decltype(*cl_kernel())>::type;
-
 
 namespace DB
 {
-    namespace ErrorCodes
-    {
-        extern const int OPENCL_ERROR;
-    }
+namespace ErrorCodes
+{
+    extern const int OPENCL_ERROR;
+}
 }
 
 struct OCL
 {
+    using KernelType = std::remove_reference<decltype(*cl_kernel())>::type;
 
     /**
      * Structure which represents the most essential settings of common OpenCl entities.
@@ -211,7 +205,7 @@ struct OCL
     static void checkError(cl_int error)
     {
         if (error != CL_SUCCESS)
-            throw DB::Exception("OpenCL error " + opencl_error_to_str(error), DB::ErrorCodes::OPENCL_ERROR);
+            throw DB::Exception("OpenCL error: " + opencl_error_to_str(error), DB::ErrorCodes::OPENCL_ERROR);
     }
 
 
@@ -223,22 +217,18 @@ struct OCL
         cl_int error = clGetPlatformIDs(settings.number_of_platform_entries, &platform,
                                         settings.number_of_available_platforms);
         checkError(error);
-
         return platform;
     }
 
-
     static cl_device_id getDeviceID(cl_platform_id & platform, const Settings & settings)
     {
         cl_device_id device;
         cl_int error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, settings.number_of_devices_entries,
                                       &device, settings.number_of_available_devices);
         OCL::checkError(error);
-
         return device;
     }
 
-
     static cl_context makeContext(cl_device_id & device, const Settings & settings)
     {
         cl_int error;
@@ -246,32 +236,43 @@ struct OCL
                                                  &device, settings.context_callback, settings.context_callback_data,
                                                  &error);
         OCL::checkError(error);
-
         return gpu_context;
     }
 
-
+    template <int version> /// version == 1: clCreateCommandQueue; any other value: clCreateCommandQueueWithProperties.
     static cl_command_queue makeCommandQueue(cl_device_id & device, cl_context & context, const Settings & settings [[maybe_unused]])
     {
         cl_int error;
-#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
-        cl_command_queue command_queue = clCreateCommandQueue(context, device, settings.command_queue_properties, &error);
-#else
-        cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device, nullptr, &error);
-#endif
-        OCL::checkError(error);
+        cl_command_queue command_queue;
 
+        if constexpr (version == 1)
+        {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+            command_queue = clCreateCommandQueue(context, device, settings.command_queue_properties, &error); /// Deprecation warning is silenced around this call.
+#pragma GCC diagnostic pop
+        }
+        else
+        {
+#ifdef CL_VERSION_2_0
+            command_queue = clCreateCommandQueueWithProperties(context, device, nullptr, &error); /// No properties are passed; `settings` is not consulted here.
+#else
+            throw DB::Exception("Binary is built with OpenCL version < 2.0", DB::ErrorCodes::OPENCL_ERROR); /// CL_VERSION_2_0 is not defined, so the 2.0 entry point is not used.
+#endif
+        }
+
+        OCL::checkError(error);
         return command_queue;
     }
 
-
     static cl_program makeProgram(const char * source_code, cl_context context,
                                   cl_device_id device_id, const Settings & settings)
     {
         cl_int error = 0;
         size_t source_size = strlen(source_code);
 
-        cl_program program = clCreateProgramWithSource(context, settings.number_of_program_source_pointers, &source_code, &source_size, &error);
+        cl_program program = clCreateProgramWithSource(context, settings.number_of_program_source_pointers,
+                                                       &source_code, &source_size, &error);
         checkError(error);
 
         error = clBuildProgram(program, settings.number_of_devices_entries, &device_id, settings.build_options,
@@ -293,39 +294,30 @@ struct OCL
         }
 
         checkError(error);
-
         return program;
     }
 
-
     /// Configuring buffer for given input data
 
     template<typename K>
-    static cl_mem createBuffer(K * p_input, cl_int array_size, cl_context context,
-                               cl_int elements_size = sizeof(K))
+    static cl_mem createBuffer(K * p_input, cl_int array_size, cl_context context, cl_int elements_size = sizeof(K))
     {
         cl_int error = CL_SUCCESS;
-        cl_mem cl_input_buffer =
-                clCreateBuffer
-                        (
+        cl_mem cl_input_buffer = clCreateBuffer( /// Creates a buffer over `p_input` (CL_MEM_USE_HOST_PTR) with the size rounded by zeroCopySizeAlignment().
                                 context,
                                 CL_MEM_USE_HOST_PTR,
-                                zeroCopySizeAlignment(elements_size * array_size),
+                                zeroCopySizeAlignment(static_cast<size_t>(elements_size) * static_cast<size_t>(array_size)), /// Multiply in size_t: a cl_int product could overflow.
                                 p_input,
-                                &error
-                        );
+                                &error);
         checkError(error);
-
         return cl_input_buffer;
     }
-
     static size_t zeroCopySizeAlignment(size_t required_size)
     {
         return required_size + (~required_size + 1) % 64;
     }
 
-
     /// Manipulating with common OpenCL variables.
 
     static void finishCommandQueue(cl_command_queue command_queue)
@@ -335,10 +327,8 @@ struct OCL
         OCL::checkError(error);
     }
 
-
     template<class T>
-    static void releaseData(T * origin, cl_int array_size, cl_mem cl_buffer,
-                            cl_command_queue command_queue, size_t offset = 0)
+    static void releaseData(T * origin, cl_int array_size, cl_mem cl_buffer, cl_command_queue command_queue, size_t offset = 0)
     {
         cl_int error = CL_SUCCESS;
 
@@ -359,7 +349,6 @@ struct OCL
         error = clReleaseMemObject(cl_buffer);
         checkError(error);
     }
-
 };
 
 #endif
diff --git a/src/Common/tests/CMakeLists.txt b/src/Common/tests/CMakeLists.txt
index 1c8a39a4fa5..72c47d1ef49 100644
--- a/src/Common/tests/CMakeLists.txt
+++ b/src/Common/tests/CMakeLists.txt
@@ -35,10 +35,10 @@ target_link_libraries (compact_array PRIVATE clickhouse_common_io)
 add_executable (radix_sort radix_sort.cpp)
 target_link_libraries (radix_sort PRIVATE clickhouse_common_io)
 
-# if (USE_OPENCL)
-#     add_executable (bitonic_sort bitonic_sort.cpp)
-#     target_link_libraries (bitonic_sort PRIVATE clickhouse_common_io ${OPENCL_LINKER_FLAGS})
-# endif ()
+if (USE_OPENCL)
+    add_executable (bitonic_sort bitonic_sort.cpp)
+    target_link_libraries (bitonic_sort PRIVATE clickhouse_common_io ${OPENCL_LINKER_FLAGS} ${OpenCL_LIBRARIES})
+endif ()
 
 add_executable (arena_with_free_lists arena_with_free_lists.cpp)
 target_link_libraries (arena_with_free_lists PRIVATE dbms)
diff --git a/src/Common/tests/bitonic_sort.cpp b/src/Common/tests/bitonic_sort.cpp
index adaef94ed4c..2545662c8cb 100644
--- a/src/Common/tests/bitonic_sort.cpp
+++ b/src/Common/tests/bitonic_sort.cpp
@@ -1,8 +1,6 @@
 #include <Common/config.h>
 #include <iostream>
 
-#if USE_OPENCL
-
 #if !defined(__APPLE__) && !defined(__FreeBSD__)
 #include <malloc.h>
 #endif
@@ -16,13 +14,10 @@
 #include "Common/BitonicSort.h"
 
 
-using Key = cl_ulong;
-
-
 /// Generates vector of size 8 for testing.
 /// Vector contains max possible value, min possible value and duplicate values.
 template <class Type>
-static void generateTest(std::vector<Type>& data, Type min_value, Type max_value)
+static void generateTest(std::vector<Type> & data, Type min_value, Type max_value)
 {
     int size = 10;
 
@@ -62,8 +57,7 @@ static void check(const std::vector<size_t> & indices, bool reverse = true)
 
 
 template <class Type>
-static void sortBitonicSortWithPodArrays(const std::vector<Type>& data,
-                                         std::vector<size_t> & indices, bool ascending = true)
+static void sortBitonicSortWithPodArrays(const std::vector<Type> & data, std::vector<size_t> & indices, bool ascending = true)
 {
     DB::PaddedPODArray<Type> pod_array_data = DB::PaddedPODArray<Type>(data.size());
     DB::IColumn::Permutation pod_array_indices = DB::IColumn::Permutation(data.size());
@@ -74,7 +68,6 @@ static void sortBitonicSortWithPodArrays(const std::vector<Type>& data,
         *(pod_array_indices.data() + index) = index;
     }
 
-    BitonicSort::getInstance().configure();
     BitonicSort::getInstance().sort(pod_array_data, pod_array_indices, ascending);
 
     for (size_t index = 0; index < data.size(); ++index)
@@ -83,7 +76,7 @@ static void sortBitonicSortWithPodArrays(const std::vector<Type>& data,
 
 
 template <class Type>
-static void testBitonicSort(std::string test_name, Type min_value, Type max_value)
+static void testBitonicSort(const std::string & test_name, Type min_value, Type max_value)
 {
     std::cerr << test_name << std::endl;
 
@@ -102,147 +95,80 @@ static void testBitonicSort(std::string test_name, Type min_value, Type max_valu
 
 static void straightforwardTests()
 {
-    testBitonicSort<cl_char>("Test 01: cl_char.", CHAR_MIN, CHAR_MAX);
-    testBitonicSort<cl_uchar>("Test 02: cl_uchar.", 0, UCHAR_MAX);
-    testBitonicSort<cl_short>("Test 03: cl_short.", SHRT_MIN, SHRT_MAX);
-    testBitonicSort<cl_ushort>("Test 04: cl_ushort.", 0, USHRT_MAX);
-    testBitonicSort<cl_int>("Test 05: cl_int.", INT_MIN, INT_MAX);
-    testBitonicSort<cl_uint >("Test 06: cl_uint.", 0, UINT_MAX);
-    testBitonicSort<cl_long >("Test 07: cl_long.", LONG_MIN, LONG_MAX);
-    testBitonicSort<cl_ulong >("Test 08: cl_ulong.", 0, ULONG_MAX);
+    testBitonicSort<DB::Int8>("Test 01: Int8.", SCHAR_MIN, SCHAR_MAX); /// SCHAR_*, not CHAR_*: plain char may be unsigned.
+    testBitonicSort<DB::UInt8>("Test 02: UInt8.", 0, UCHAR_MAX);
+    testBitonicSort<DB::Int16>("Test 03: Int16.", SHRT_MIN, SHRT_MAX);
+    testBitonicSort<DB::UInt16>("Test 04: UInt16.", 0, USHRT_MAX);
+    testBitonicSort<DB::Int32>("Test 05: Int32.", INT_MIN, INT_MAX);
+    testBitonicSort<DB::UInt32>("Test 06: UInt32.", 0, UINT_MAX);
+    testBitonicSort<DB::Int64>("Test 07: Int64.", LLONG_MIN, LLONG_MAX); /// LLONG_*, not LONG_*: long may be 32-bit.
+    testBitonicSort<DB::UInt64>("Test 08: UInt64.", 0, ULLONG_MAX);
 }
 
 
-static void NO_INLINE sort1(Key * data, size_t size)
+/// Sorts `data` in place: builds a permutation with sortBitonicSortWithPodArrays and applies it.
+template <typename T>
+static void bitonicSort(std::vector<T> & data)
 {
-    std::sort(data, data + size);
-}
-
-
-static void NO_INLINE sort2(std::vector<Key> & data, std::vector<size_t> & indices)
-{
-    BitonicSort::getInstance().configure();
+    std::vector<size_t> indices(data.size());
+    for (size_t pos = 0; pos < data.size(); ++pos)
+        indices[pos] = pos;
 
     sortBitonicSortWithPodArrays(data, indices);
 
-    std::vector<Key> result(data.size());
-    for (size_t index = 0; index < data.size(); ++index)
-        result[index] = data[indices[index]];
+    std::vector<T> result(data.size());
+    for (size_t pos = 0; pos < data.size(); ++pos)
+        result[pos] = data[indices[pos]];
 
     data = std::move(result);
 }
 
 
-int main(int argc, char ** argv)
+/// Checks bitonic sort against std::sort on the first `size` elements of `data`.
+/// Both sorts run on private copies, so `data` is left untouched.
+/// Returns true when the two sorted sequences are equal.
+template <typename T>
+static bool checkSort(const std::vector<T> & data, size_t size)
 {
-    straightforwardTests();
+    std::vector<T> expected(data.begin(), data.begin() + size);
+    std::vector<T> actual(expected);
 
-    if (argc < 3)
-    {
-        std::cerr << "Not enough arguments were passed\n";
-        return 1;
-    }
+    std::sort(expected.begin(), expected.end());
 
-    size_t n = DB::parse<size_t>(argv[1]);
-    size_t method = DB::parse<size_t>(argv[2]);
+    bitonicSort<T>(actual);
 
-    std::vector<Key> data(n);
-    std::vector<size_t> indices(n);
-
-    {
-        Stopwatch watch;
-
-        for (auto & elem : data)
-            elem = static_cast<Key>(rand());
-
-        for (size_t i = 0; i < n; ++i)
-            indices[i] = i;
-
-        watch.stop();
-        double elapsed = watch.elapsedSeconds();
-        std::cerr
-                << "Filled in " << elapsed
-                << " (" << n / elapsed << " elem/sec., "
-                << n * sizeof(Key) / elapsed / 1048576 << " MB/sec.)"
-                << std::endl;
-    }
-
-    if (n <= 100)
-    {
-        std::cerr << std::endl;
-        for (const auto & elem : data)
-            std::cerr << elem << ' ';
-        std::cerr << std::endl;
-        for (const auto & index : indices)
-            std::cerr << index << ' ';
-        std::cerr << std::endl;
-    }
-
-    {
-        Stopwatch watch;
-
-        if (method == 1)    sort1(data.data(), n);
-        if (method == 2)    sort2(data, indices);
-
-        watch.stop();
-        double elapsed = watch.elapsedSeconds();
-        std::cerr
-                << "Sorted in " << elapsed
-                << " (" << n / elapsed << " elem/sec., "
-                << n * sizeof(Key) / elapsed / 1048576 << " MB/sec.)"
-                << std::endl;
-    }
-
-    {
-        Stopwatch watch;
-
-        size_t i = 1;
-        while (i < n)
-        {
-            if (!(data[i - 1] <= data[i]))
-                break;
-            ++i;
-        }
-
-        watch.stop();
-        double elapsed = watch.elapsedSeconds();
-        std::cerr
-                << "Checked in " << elapsed
-                << " (" << n / elapsed << " elem/sec., "
-                << n * sizeof(Key) / elapsed / 1048576 << " MB/sec.)"
-                << std::endl
-                << "Result: " << (i == n ? "Ok." : "Fail!") << std::endl;
-    }
-
-    if (n <= 1000)
-    {
-        std::cerr << std::endl;
-
-        std::cerr << data[0] << ' ';
-        for (size_t i = 1; i < n; ++i)
-        {
-            if (!(data[i - 1] <= data[i]))
-                std::cerr << "*** ";
-            std::cerr << data[i] << ' ';
-        }
-
-        std::cerr << std::endl;
-
-        for (const auto & index : indices)
-            std::cerr << index << ' ';
-        std::cerr << std::endl;
-    }
-
-    return 0;
+    return expected == actual;
 }
 
-#else
 
 int main()
 {
-    std::cerr << "Openc CL disabled.";
+    /// Configure the BitonicSort singleton once, before any sort() call below.
+    BitonicSort::getInstance().configure();
+
+    /// Small fixed tests for each integer type from Int8 to UInt64.
+    straightforwardTests();
+
+    /// Input for the prefix checks; rand() is not seeded here.
+    const size_t total = 1100;
+    std::vector<int> values(total);
+    for (auto & value : values)
+        value = rand();
+
+    /// Compare bitonic sort with std::sort on prefixes of `values`:
+    /// every length in [0, 128), then every 7th length from 128 while below `total`.
+    size_t length = 0;
+    while (length < total)
+    {
+        if (!checkSort<int>(values, length))
+        {
+            std::cerr << "fail at length " << length << std::endl;
+            return 1;
+        }
+
+        /// Dense steps for short prefixes, sparse steps afterwards.
+        length += (length < 128) ? 1 : 7;
+    }
 
     return 0;
 }
-
-#endif
diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in
index 054ee9a80b7..620c23c21cc 100644
--- a/src/Core/config_core.h.in
+++ b/src/Core/config_core.h.in
@@ -8,3 +8,4 @@
 #cmakedefine01 USE_EMBEDDED_COMPILER
 #cmakedefine01 USE_INTERNAL_LLVM_LIBRARY
 #cmakedefine01 USE_SSL
+#cmakedefine01 USE_OPENCL
diff --git a/src/Interpreters/sortBlock.cpp b/src/Interpreters/sortBlock.cpp
index 0e98dc0eb4b..ec0865c2fb5 100644
--- a/src/Interpreters/sortBlock.cpp
+++ b/src/Interpreters/sortBlock.cpp
@@ -13,7 +13,6 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int BAD_COLLATION;
-    extern const int OPENCL_ERROR;
 }
 
 static bool isCollationRequired(const SortColumnDescription & description)
@@ -134,20 +133,12 @@ void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
         else if (!isColumnConst(*column))
         {
             int nan_direction_hint = description[0].nulls_direction;
+            auto special_sort = description[0].special_sort;
 
-            /// If in Settings `special_sort` option has been set as `bitonic_sort`,
-            /// then via `nan_direction_hint` variable a flag which specifies bitonic sort as preferred
-            /// will be passed to `getPermutation` method with value 42.
-            if (description[0].special_sort == SpecialSort::OPENCL_BITONIC)
-            {
-#ifdef USE_OPENCL
-                nan_direction_hint = 42;
-#else
-                throw DB::Exception("Bitonic sort specified as preferred, but OpenCL not available", DB::ErrorCodes::OPENCL_ERROR);
-#endif
-            }
-
-            column->getPermutation(reverse, limit, nan_direction_hint, perm);
+            if (special_sort == SpecialSort::OPENCL_BITONIC)
+                column->getSpecialPermutation(reverse, limit, nan_direction_hint, perm, IColumn::SpecialSort::OPENCL_BITONIC);
+            else
+                column->getPermutation(reverse, limit, nan_direction_hint, perm);
         }
         else
             /// we don't need to do anything with const column