diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 1c3beb2e47d..7cbca175c0d 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -1,5 +1,7 @@ configure_file(config_functions.h.in ${ConfigIncludePath}/config_functions.h) +add_subdirectory(divide) + include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake) add_headers_and_sources(clickhouse_functions .) @@ -25,7 +27,7 @@ target_link_libraries(clickhouse_functions PRIVATE ${ZLIB_LIBRARIES} boost::filesystem - libdivide + divide_impl ) if (OPENSSL_CRYPTO_LIBRARY) diff --git a/src/Functions/divide/CMakeLists.txt b/src/Functions/divide/CMakeLists.txt new file mode 100644 index 00000000000..e5a10f0817c --- /dev/null +++ b/src/Functions/divide/CMakeLists.txt @@ -0,0 +1,22 @@ +# A library for integer division by constant with CPU dispatching. + +if (ARCH_AMD64) + add_library(divide_impl_sse2 divideImpl.cpp) + target_compile_options(divide_impl_sse2 PRIVATE -msse2 -DNAMESPACE=SSE2) + target_link_libraries(divide_impl_sse2 libdivide) + + add_library(divide_impl_avx2 divideImpl.cpp) + target_compile_options(divide_impl_avx2 PRIVATE -mavx2 -DNAMESPACE=AVX2) + target_link_libraries(divide_impl_avx2 libdivide) + + set(IMPLEMENTATIONS divide_impl_sse2 divide_impl_avx2) +else () + add_library(divide_impl_generic divideImpl.cpp) + target_compile_options(divide_impl_generic PRIVATE -DNAMESPACE=Generic) + target_link_libraries(divide_impl_generic libdivide) + + set(IMPLEMENTATIONS divide_impl_generic) +endif () + +add_library(divide_impl divide.cpp) +target_link_libraries(divide_impl ${IMPLEMENTATIONS} clickhouse_common_io) diff --git a/src/Functions/divide/divide.cpp b/src/Functions/divide/divide.cpp new file mode 100644 index 00000000000..5ab11df2a65 --- /dev/null +++ b/src/Functions/divide/divide.cpp @@ -0,0 +1,57 @@ +#include "divide.h" +#include + +#if defined(__x86_64__) && !defined(ARCADIA_BUILD) +namespace SSE2 +{ + template + void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size); +} + +namespace AVX2 +{ + template + void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size); +} +#else +namespace Generic +{ + template + void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size); +} +#endif + + +template +void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size) +{ +#if defined(__x86_64__) && !defined(ARCADIA_BUILD) + if (DB::Cpu::CpuFlagsCache::have_AVX2) + AVX2::divideImpl(a_pos, b, c_pos, size); + else if (DB::Cpu::CpuFlagsCache::have_SSE2) + SSE2::divideImpl(a_pos, b, c_pos, size); +#else + Generic::divideImpl(a_pos, b, c_pos, size); +#endif +} + + +template void divideImpl(const uint64_t * __restrict, uint64_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint32_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint16_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, char8_t, uint64_t * __restrict, size_t); + +template void divideImpl(const uint32_t * __restrict, uint64_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint32_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint16_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, char8_t, uint32_t * __restrict, size_t); + +template void divideImpl(const int64_t * __restrict, int64_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int32_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int16_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int8_t, int64_t * __restrict, size_t); + +template void divideImpl(const int32_t * __restrict, int64_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int32_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int16_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int8_t, int32_t * __restrict, size_t); diff --git a/src/Functions/divide/divide.h b/src/Functions/divide/divide.h new file mode 100644 index 00000000000..1c17a461159 --- /dev/null +++ b/src/Functions/divide/divide.h @@ -0,0 +1,6 @@ +#pragma once + +#include + +template +extern void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size); diff --git a/src/Functions/divide/divideImpl.cpp b/src/Functions/divide/divideImpl.cpp new file mode 100644 index 00000000000..f4c1a97d3ad --- /dev/null +++ b/src/Functions/divide/divideImpl.cpp @@ -0,0 +1,79 @@ +/// This translation unit should be compiled multiple times +/// with different values of NAMESPACE and machine flags (sse2, avx2). + +#if !defined(NAMESPACE) + #if defined(ARCADIA_BUILD) + #define NAMESPACE Generic + #else + #error "NAMESPACE macro must be defined" + #endif +#endif + +#if defined(__AVX2__) + #define REG_SIZE 32 + #define LIBDIVIDE_AVX2 +#elif defined(__SSE2__) + #define REG_SIZE 16 + #define LIBDIVIDE_SSE2 +#endif + +#include + + +namespace NAMESPACE +{ + +template +void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size) +{ + libdivide::divider divider(b); + const A * a_end = a_pos + size; + +#if defined(__SSE2__) + static constexpr size_t values_per_simd_register = REG_SIZE / sizeof(A); + const A * a_end_simd = a_pos + size / values_per_simd_register * values_per_simd_register; + + while (a_pos < a_end_simd) + { +#if defined(__AVX2__) + _mm256_storeu_si256(reinterpret_cast<__m256i *>(c_pos), + _mm256_loadu_si256(reinterpret_cast(a_pos)) / divider); +#else + _mm_storeu_si128(reinterpret_cast<__m128i *>(c_pos), + _mm_loadu_si128(reinterpret_cast(a_pos)) / divider); +#endif + + a_pos += values_per_simd_register; + c_pos += values_per_simd_register; + } +#endif + + while (a_pos < a_end) + { + *c_pos = *a_pos / divider; + ++a_pos; + ++c_pos; + } +} + +template void divideImpl(const uint64_t * __restrict, uint64_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint32_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint16_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, char8_t, uint64_t * __restrict, size_t); + +template void divideImpl(const uint32_t * __restrict, uint64_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint32_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint16_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, char8_t, uint32_t * __restrict, size_t); + +template void divideImpl(const int64_t * __restrict, int64_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int32_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int16_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int8_t, int64_t * __restrict, size_t); + +template void divideImpl(const int32_t * __restrict, int64_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int32_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int16_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int8_t, int32_t * __restrict, size_t); + +} diff --git a/src/Functions/intDiv.cpp b/src/Functions/intDiv.cpp index 804696f2776..79e35a19283 100644 --- a/src/Functions/intDiv.cpp +++ b/src/Functions/intDiv.cpp @@ -1,11 +1,7 @@ #include #include -#if defined(__SSE2__) -# define LIBDIVIDE_SSE2 1 -#endif - -#include +#include "divide/divide.h" namespace DB @@ -70,34 +66,11 @@ struct DivideIntegralByConstantImpl if (unlikely(static_cast(b) == 0)) throw Exception("Division by zero", ErrorCodes::ILLEGAL_DIVISION); - libdivide::divider divider(b); - - const A * a_end = a_pos + size; - -#if defined(__SSE2__) - static constexpr size_t values_per_sse_register = 16 / sizeof(A); - const A * a_end_sse = a_pos + size / values_per_sse_register * values_per_sse_register; - - while (a_pos < a_end_sse) - { - _mm_storeu_si128(reinterpret_cast<__m128i *>(c_pos), - _mm_loadu_si128(reinterpret_cast(a_pos)) / divider); - - a_pos += values_per_sse_register; - c_pos += values_per_sse_register; - } -#endif - - while (a_pos < a_end) - { - *c_pos = *a_pos / divider; - ++a_pos; - ++c_pos; - } + divideImpl(a_pos, b, c_pos, size); } }; -/** Specializations are specified for dividing numbers of the type UInt64 and UInt32 by the numbers of the same sign. +/** Specializations are specified for dividing numbers of the type UInt64, UInt32, Int64, Int32 by the numbers of the same sign. * Can be expanded to all possible combinations, but more code is needed. */ diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 52ed54ec64f..660f7b115bf 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -229,6 +229,8 @@ SRCS( defaultValueOfTypeName.cpp demange.cpp divide.cpp + divide/divide.cpp + divide/divideImpl.cpp dumpColumnStructure.cpp e.cpp empty.cpp diff --git a/tests/performance/intDiv.xml b/tests/performance/intDiv.xml new file mode 100644 index 00000000000..c6fa0238986 --- /dev/null +++ b/tests/performance/intDiv.xml @@ -0,0 +1,5 @@ + + SELECT count() FROM numbers(200000000) WHERE NOT ignore(intDiv(number, 1000000000)) + SELECT count() FROM numbers(200000000) WHERE NOT ignore(divide(number, 1000000000)) + SELECT count() FROM numbers(200000000) WHERE NOT ignore(toUInt32(divide(number, 1000000000))) +