diff --git a/src/Functions/divide/CMakeLists.txt b/src/Functions/divide/CMakeLists.txt new file mode 100644 index 00000000000..2bdd7e4c5ef --- /dev/null +++ b/src/Functions/divide/CMakeLists.txt @@ -0,0 +1,10 @@ +add_library(divide_impl_sse2 divideImpl.cpp) +target_compile_options(divide_impl_sse2 PRIVATE -msse2 -DNAMESPACE=SSE2) +target_link_libraries(divide_impl_sse2 libdivide) + +add_library(divide_impl_avx2 divideImpl.cpp) +target_compile_options(divide_impl_avx2 PRIVATE -mavx2 -DNAMESPACE=AVX2) +target_link_libraries(divide_impl_avx2 libdivide) + +add_library(divide_impl divide.cpp) +target_link_libraries(divide_impl divide_impl_sse2 divide_impl_avx2 clickhouse_common_io) diff --git a/src/Functions/divide/divide.cpp b/src/Functions/divide/divide.cpp new file mode 100644 index 00000000000..0c275dff6f6 --- /dev/null +++ b/src/Functions/divide/divide.cpp @@ -0,0 +1,66 @@ +#include "divide.h" +#include + + +namespace SSE2 +{ + template + void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size); +} + +namespace AVX2 +{ + template + void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size); +} + + +template +void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size) +{ + if (DB::Cpu::CpuFlagsCache::have_AVX2) + AVX2::divideImpl(a_pos, b, c_pos, size); + else if (DB::Cpu::CpuFlagsCache::have_SSE2) + SSE2::divideImpl(a_pos, b, c_pos, size); +} + + +template void divideImpl(const uint64_t * __restrict, uint64_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint32_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint16_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, char8_t, uint64_t * __restrict, size_t); + +template void divideImpl(const uint32_t * __restrict, uint64_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint32_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint16_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, char8_t, uint32_t * __restrict, size_t); + +template void divideImpl(const int64_t * __restrict, uint64_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, uint32_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, uint16_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, char8_t, int64_t * __restrict, size_t); + +template void divideImpl(const int32_t * __restrict, uint64_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, uint32_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, uint16_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, char8_t, int32_t * __restrict, size_t); + +template void divideImpl(const uint64_t * __restrict, int64_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, int32_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, int16_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, int8_t, uint64_t * __restrict, size_t); + +template void divideImpl(const uint32_t * __restrict, int64_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, int32_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, int16_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, int8_t, uint32_t * __restrict, size_t); + +template void divideImpl(const int64_t * __restrict, int64_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int32_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int16_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int8_t, int64_t * __restrict, size_t); + +template void divideImpl(const int32_t * __restrict, int64_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int32_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int16_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int8_t, int32_t * __restrict, size_t); diff --git a/src/Functions/divide/divide.h b/src/Functions/divide/divide.h new file mode 100644 index 00000000000..11a5371bc31 --- /dev/null +++ b/src/Functions/divide/divide.h @@ -0,0 +1,4 @@ +#include + +template +void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size); diff --git a/src/Functions/divide/divideImpl.cpp b/src/Functions/divide/divideImpl.cpp new file mode 100644 index 00000000000..a5c1755ab1f --- /dev/null +++ b/src/Functions/divide/divideImpl.cpp @@ -0,0 +1,95 @@ +/// This translation unit should be compiled multiple times +/// with different values of NAMESPACE and machine flags (sse2, avx2). + +#if !defined(NAMESPACE) +#error "NAMESPACE macro must be defined" +#endif + +#if defined(__AVX2__) + #define REG_SIZE 32 + #define LIBDIVIDE_AVX2 +#elif defined(__SSE2__) + #define REG_SIZE 16 + #define LIBDIVIDE_SSE2 +#endif + +#include + + +namespace NAMESPACE +{ + +template +void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size) +{ + libdivide::divider divider(b); + const A * a_end = a_pos + size; + +#if defined(__SSE2__) + static constexpr size_t values_per_simd_register = REG_SIZE / sizeof(A); + const A * a_end_simd = a_pos + size / values_per_simd_register * values_per_simd_register; + + while (a_pos < a_end_simd) + { +#if defined(__AVX2__) + _mm256_storeu_si256(reinterpret_cast<__m256i *>(c_pos), + _mm256_loadu_si256(reinterpret_cast(a_pos)) / divider); +#else + _mm_storeu_si128(reinterpret_cast<__m128i *>(c_pos), + _mm_loadu_si128(reinterpret_cast(a_pos)) / divider); +#endif + + a_pos += values_per_simd_register; + c_pos += values_per_simd_register; + } +#endif + + while (a_pos < a_end) + { + *c_pos = *a_pos / divider; + ++a_pos; + ++c_pos; + } +} + +template void divideImpl(const uint64_t * __restrict, uint64_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint32_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, uint16_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, char8_t, uint64_t * __restrict, size_t); + +template void divideImpl(const uint32_t * __restrict, uint64_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint32_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, uint16_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, char8_t, uint32_t * __restrict, size_t); + +template void divideImpl(const int64_t * __restrict, uint64_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, uint32_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, uint16_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, char8_t, int64_t * __restrict, size_t); + +template void divideImpl(const int32_t * __restrict, uint64_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, uint32_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, uint16_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, char8_t, int32_t * __restrict, size_t); + +template void divideImpl(const uint64_t * __restrict, int64_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, int32_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, int16_t, uint64_t * __restrict, size_t); +template void divideImpl(const uint64_t * __restrict, int8_t, uint64_t * __restrict, size_t); + +template void divideImpl(const uint32_t * __restrict, int64_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, int32_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, int16_t, uint32_t * __restrict, size_t); +template void divideImpl(const uint32_t * __restrict, int8_t, uint32_t * __restrict, size_t); + +template void divideImpl(const int64_t * __restrict, int64_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int32_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int16_t, int64_t * __restrict, size_t); +template void divideImpl(const int64_t * __restrict, int8_t, int64_t * __restrict, size_t); + +template void divideImpl(const int32_t * __restrict, int64_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int32_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int16_t, int32_t * __restrict, size_t); +template void divideImpl(const int32_t * __restrict, int8_t, int32_t * __restrict, size_t); + +}