From e513e9808b0ec716454ad4df22783f9f5ae5df12 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 14 Jan 2016 00:05:11 +0300 Subject: [PATCH] dbms: porting to aarch64 [#METR-19609]. --- dbms/include/DB/Columns/ColumnVector.h | 28 ++++++++++++------- .../DB/Functions/FunctionsArithmetic.h | 8 +++++- dbms/include/DB/Functions/FunctionsString.h | 15 ++++++---- dbms/src/Columns/ColumnsCommon.cpp | 21 ++++++++------ dbms/src/DataTypes/DataTypeString.cpp | 6 +++- dbms/src/IO/ReadHelpers.cpp | 8 +++++- dbms/src/IO/tests/mempbrk.cpp | 7 +++-- dbms/src/IO/tests/parse_int_perf2.cpp | 6 +++- .../Interpreters/tests/hash_map_string_2.cpp | 23 ++++++++++++++- .../Interpreters/tests/hash_map_string_3.cpp | 16 ++++++++++- 10 files changed, 107 insertions(+), 31 deletions(-) diff --git a/dbms/include/DB/Columns/ColumnVector.h b/dbms/include/DB/Columns/ColumnVector.h index 9b81aac4451..20c86cb4599 100644 --- a/dbms/include/DB/Columns/ColumnVector.h +++ b/dbms/include/DB/Columns/ColumnVector.h @@ -10,6 +10,10 @@ #include +#if defined(__x86_64__) + #include +#endif + namespace DB { @@ -288,17 +292,20 @@ public: if (result_size_hint) res_data.reserve(result_size_hint > 0 ? result_size_hint : size); - /** Чуть более оптимизированная версия. - * Исходит из допущения, что часто куски последовательно идущих значений - * полностью проходят или полностью не проходят фильтр. - * Поэтому, будем оптимистично проверять куски по 16 значений. - */ const UInt8 * filt_pos = &filt[0]; const UInt8 * filt_end = filt_pos + size; - const UInt8 * filt_end_sse = filt_pos + size / 16 * 16; const T * data_pos = &data[0]; +#if defined(__x86_64__) + /** Чуть более оптимизированная версия. + * Исходит из допущения, что часто куски последовательно идущих значений + * полностью проходят или полностью не проходят фильтр. + * Поэтому, будем оптимистично проверять куски по SIMD_BYTES значений. + */ + + static constexpr size_t SIMD_BYTES = 16; const __m128i zero16 = _mm_setzero_si128(); + const UInt8 * filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { @@ -310,18 +317,19 @@ public: } else if (0xFFFF == mask) { - res_data.insert(data_pos, data_pos + 16); + res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - for (size_t i = 0; i < 16; ++i) + for (size_t i = 0; i < SIMD_BYTES; ++i) if (filt_pos[i]) res_data.push_back(data_pos[i]); } - filt_pos += 16; - data_pos += 16; + filt_pos += SIMD_BYTES; + data_pos += SIMD_BYTES; } +#endif while (filt_pos < filt_end) { diff --git a/dbms/include/DB/Functions/FunctionsArithmetic.h b/dbms/include/DB/Functions/FunctionsArithmetic.h index f3c81965fb3..6cff344c5a6 100644 --- a/dbms/include/DB/Functions/FunctionsArithmetic.h +++ b/dbms/include/DB/Functions/FunctionsArithmetic.h @@ -913,7 +913,10 @@ template <> struct FunctionUnaryArithmeticMonotonicity /// Оптимизации для целочисленного деления на константу. -#define LIBDIVIDE_USE_SSE2 1 +#if defined(__x86_64__) + #define LIBDIVIDE_USE_SSE2 1 +#endif + #include @@ -947,6 +950,8 @@ struct DivideIntegralByConstantImpl const A * a_pos = &a[0]; const A * a_end = a_pos + size; ResultType * c_pos = &c[0]; + +#if defined(__x86_64__) static constexpr size_t values_per_sse_register = 16 / sizeof(A); const A * a_end_sse = a_pos + size / values_per_sse_register * values_per_sse_register; @@ -958,6 +963,7 @@ struct DivideIntegralByConstantImpl a_pos += values_per_sse_register; c_pos += values_per_sse_register; } +#endif while (a_pos < a_end) { diff --git a/dbms/include/DB/Functions/FunctionsString.h b/dbms/include/DB/Functions/FunctionsString.h index adf83fba5d1..7052dfdc083 100644 --- a/dbms/include/DB/Functions/FunctionsString.h +++ b/dbms/include/DB/Functions/FunctionsString.h @@ -17,8 +17,10 @@ #include #include -#include -#include +#if defined(__x86_64__) + #include + #include +#endif namespace DB @@ -233,11 +235,12 @@ struct LowerUpperImpl private: static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst) { + const auto flip_case_mask = 'A' ^ 'a'; + +#if defined(__x86_64__) const auto bytes_sse = sizeof(__m128i); const auto src_end_sse = src_end - (src_end - src) % bytes_sse; - const auto flip_case_mask = 'A' ^ 'a'; - const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1); const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1); const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask); @@ -260,6 +263,7 @@ private: /// store result back to destination _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars); } +#endif for (; src < src_end; ++src, ++dst) if (*src >= not_case_lower_bound && *src <= not_case_upper_bound) @@ -394,6 +398,7 @@ private: static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst) { +#if defined(__x86_64__) const auto bytes_sse = sizeof(__m128i); auto src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse; @@ -455,7 +460,7 @@ private: } } } - +#endif /// handle remaining symbols while (src < src_end) toCase(src, src_end, dst); diff --git a/dbms/src/Columns/ColumnsCommon.cpp b/dbms/src/Columns/ColumnsCommon.cpp index 0f6085ecdb8..2beb2a9455f 100644 --- a/dbms/src/Columns/ColumnsCommon.cpp +++ b/dbms/src/Columns/ColumnsCommon.cpp @@ -1,4 +1,6 @@ -#include +#if defined(__x86_64__) + #include +#endif #include @@ -15,10 +17,11 @@ size_t countBytesInFilter(const IColumn::Filter & filt) * Лучше было бы использовать != 0, то это не позволяет SSE2. */ - const __m128i zero16 = _mm_setzero_si128(); - const Int8 * pos = reinterpret_cast(&filt[0]); const Int8 * end = pos + filt.size(); + +#if defined(__x86_64__) + const __m128i zero16 = _mm_setzero_si128(); const Int8 * end64 = pos + filt.size() / 64 * 64; for (; pos < end64; pos += 64) @@ -35,6 +38,7 @@ size_t countBytesInFilter(const IColumn::Filter & filt) | (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( _mm_loadu_si128(reinterpret_cast(pos + 48)), zero16))) << 48)); +#endif for (; pos < end; ++pos) count += *pos > 0; @@ -71,17 +75,12 @@ void filterArraysImpl( IColumn::Offset_t current_src_offset = 0; - static constexpr size_t SIMD_BYTES = 16; - const UInt8 * filt_pos = &filt[0]; const auto filt_end = filt_pos + size; - const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES; auto offsets_pos = &src_offsets[0]; const auto offsets_begin = offsets_pos; - const __m128i zero_vec = _mm_setzero_si128(); - /// copy array ending at *end_offset_ptr const auto copy_array = [&] (const IColumn::Offset_t * offset_ptr) { @@ -96,6 +95,11 @@ void filterArraysImpl( memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T)); }; +#if defined(__x86_64__) + const __m128i zero_vec = _mm_setzero_si128(); + static constexpr size_t SIMD_BYTES = 16; + const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES; + while (filt_pos < filt_end_aligned) { const auto mask = _mm_movemask_epi8(_mm_cmpgt_epi8( @@ -149,6 +153,7 @@ void filterArraysImpl( filt_pos += SIMD_BYTES; offsets_pos += SIMD_BYTES; } +#endif while (filt_pos < filt_end) { diff --git a/dbms/src/DataTypes/DataTypeString.cpp b/dbms/src/DataTypes/DataTypeString.cpp index 57b718b3ed8..0f4b4c95a35 100644 --- a/dbms/src/DataTypes/DataTypeString.cpp +++ b/dbms/src/DataTypes/DataTypeString.cpp @@ -12,7 +12,9 @@ #include #include -#include +#if defined(__x86_64__) + #include +#endif namespace DB @@ -91,6 +93,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars_t & data, Column if (size) { +#if defined(__x86_64__) /// Оптимистичная ветка, в которой возможно более эффективное копирование. if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end()) { @@ -121,6 +124,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars_t & data, Column istr.position() += size; } else +#endif { istr.readStrict(reinterpret_cast(&data[offset - size - 1]), size); } diff --git a/dbms/src/IO/ReadHelpers.cpp b/dbms/src/IO/ReadHelpers.cpp index c2448a9cdb2..2164f2be3de 100644 --- a/dbms/src/IO/ReadHelpers.cpp +++ b/dbms/src/IO/ReadHelpers.cpp @@ -1,4 +1,6 @@ -#include +#if defined(__x86_64__) + #include +#endif #include @@ -112,6 +114,7 @@ void readStringUntilEOF(String & s, ReadBuffer & buf) */ static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end) { +#if defined(__x86_64__) static const char tab_chars[16] = {'\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t'}; static const char lf_chars[16] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'}; static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'}; @@ -140,6 +143,7 @@ static inline const char * find_first_tab_lf_or_backslash(const char * begin, co if (bit_mask) return begin + __builtin_ctz(bit_mask); } +#endif for (; begin < end; ++begin) if (*begin == '\t' || *begin == '\n' || *begin == '\\') @@ -202,6 +206,7 @@ void readEscapedString(DB::String & s, DB::ReadBuffer & buf) template static inline const char * find_first_quote_or_backslash(const char * begin, const char * end) { +#if defined(__x86_64__) static const char quote_chars[16] = {quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote}; static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; @@ -226,6 +231,7 @@ static inline const char * find_first_quote_or_backslash(const char * begin, con if (bit_mask) return begin + __builtin_ctz(bit_mask); } +#endif for (; begin < end; ++begin) if (*begin == quote || *begin == '\\') diff --git a/dbms/src/IO/tests/mempbrk.cpp b/dbms/src/IO/tests/mempbrk.cpp index ff921fa714d..f9c356e2d02 100644 --- a/dbms/src/IO/tests/mempbrk.cpp +++ b/dbms/src/IO/tests/mempbrk.cpp @@ -1,4 +1,6 @@ -#include +#if defined(__x86_64__) + #include +#endif #include #include @@ -35,6 +37,7 @@ namespace test */ static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end) { +#if defined(__x86_64__) static const char tab_chars[16] = {'\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t'}; static const char lf_chars[16] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'}; static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'}; @@ -63,7 +66,7 @@ namespace test if (bit_mask) return begin + __builtin_ctz(bit_mask); } - +#endif for (; begin < end; ++begin) if (*begin == '\t' || *begin == '\n' || *begin == '\\') return begin; diff --git a/dbms/src/IO/tests/parse_int_perf2.cpp b/dbms/src/IO/tests/parse_int_perf2.cpp index 3b17b400e34..4b7db4e7fee 100644 --- a/dbms/src/IO/tests/parse_int_perf2.cpp +++ b/dbms/src/IO/tests/parse_int_perf2.cpp @@ -1,4 +1,6 @@ -#include +#if defined(__x86_64__) + #include +#endif #include #include @@ -9,6 +11,7 @@ #include +#if defined(__x86_64__) std::ostream & operator<< (std::ostream & ostr, const __m128i vec) { char digits[16]; @@ -21,6 +24,7 @@ std::ostream & operator<< (std::ostream & ostr, const __m128i vec) return ostr; } +#endif namespace test diff --git a/dbms/src/Interpreters/tests/hash_map_string_2.cpp b/dbms/src/Interpreters/tests/hash_map_string_2.cpp index ce195bc7259..4fa8e50be51 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_2.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_2.cpp @@ -15,7 +15,9 @@ #include #include -#include +#if defined(__x86_64__) + #include +#endif /** Выполнять так: @@ -74,11 +76,15 @@ DefineStringRef(StringRef_Compare8_1_byUInt64) DefineStringRef(StringRef_Compare16_1_byMemcmp) DefineStringRef(StringRef_Compare16_1_byUInt64_logicAnd) DefineStringRef(StringRef_Compare16_1_byUInt64_bitAnd) + +#if defined(__x86_64__) DefineStringRef(StringRef_Compare16_1_byIntSSE) DefineStringRef(StringRef_Compare16_1_byFloatSSE) DefineStringRef(StringRef_Compare16_1_bySSE4) DefineStringRef(StringRef_Compare16_1_bySSE4_wide) DefineStringRef(StringRef_Compare16_1_bySSE_wide) +#endif + DefineStringRef(StringRef_CompareAlwaysTrue) DefineStringRef(StringRef_CompareAlmostAlwaysTrue) @@ -190,6 +196,8 @@ inline bool compare_byUInt64_bitAnd(const char * p1, const char * p2) & (reinterpret_cast(p1)[1] == reinterpret_cast(p2)[1]); } +#if defined(__x86_64__) + inline bool compare_byIntSSE(const char * p1, const char * p2) { return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8( @@ -204,6 +212,8 @@ inline bool compare_byFloatSSE(const char * p1, const char * p2) _mm_loadu_ps(reinterpret_cast(p2)))); } +#endif + template inline bool memequal(const char * p1, const char * p2, size_t size) @@ -253,6 +263,8 @@ inline bool memequal(const char * p1, const char * p2, size_t size) } +#if defined(__x86_64__) + inline bool memequal_sse41(const char * p1, const char * p2, size_t size) { // const char * p1_end = p1 + size; @@ -483,6 +495,8 @@ inline bool memequal_sse_wide(const char * p1, const char * p2, size_t size) return true; } +#endif + #define Op(METHOD) \ inline bool operator==(StringRef_Compare16_1_ ## METHOD lhs, StringRef_Compare16_1_ ## METHOD rhs) \ @@ -499,6 +513,9 @@ inline bool operator==(StringRef_Compare16_1_ ## METHOD lhs, StringRef_Compare16 Op(byMemcmp) Op(byUInt64_logicAnd) Op(byUInt64_bitAnd) + +#if defined(__x86_64__) + Op(byIntSSE) Op(byFloatSSE) @@ -536,6 +553,8 @@ inline bool operator==(StringRef_Compare16_1_bySSE_wide lhs, StringRef_Compare16 return memequal_sse_wide(lhs.data, rhs.data, lhs.size); } +#endif + inline bool operator==(StringRef_CompareAlwaysTrue lhs, StringRef_CompareAlwaysTrue rhs) { @@ -623,11 +642,13 @@ int main(int argc, char ** argv) if (!m || m == 5) bench (data, "StringRef_Compare16_1_byMemcmp"); if (!m || m == 6) bench(data, "StringRef_Compare16_1_byUInt64_logicAnd"); if (!m || m == 7) bench (data, "StringRef_Compare16_1_byUInt64_bitAnd"); +#if defined(__x86_64__) if (!m || m == 8) bench (data, "StringRef_Compare16_1_byIntSSE"); if (!m || m == 9) bench (data, "StringRef_Compare16_1_byFloatSSE"); if (!m || m == 10) bench (data, "StringRef_Compare16_1_bySSE4"); if (!m || m == 11) bench (data, "StringRef_Compare16_1_bySSE4_wide"); if (!m || m == 12) bench (data, "StringRef_Compare16_1_bySSE_wide"); +#endif if (!m || m == 100) bench (data, "StringRef_CompareAlwaysTrue"); if (!m || m == 101) bench (data, "StringRef_CompareAlmostAlwaysTrue"); diff --git a/dbms/src/Interpreters/tests/hash_map_string_3.cpp b/dbms/src/Interpreters/tests/hash_map_string_3.cpp index ccc7b9f9b33..c6fb9b6ea6a 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_3.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_3.cpp @@ -18,7 +18,9 @@ #include #include -#include +#if defined(__x86_64__) + #include +#endif /** Выполнять так: @@ -137,6 +139,8 @@ struct FastHash64 }; +#if defined(__x86_64__) + struct CrapWow { size_t operator() (StringRef x) const @@ -206,6 +210,8 @@ struct CrapWow } }; +#endif + struct SimpleHash { @@ -306,6 +312,8 @@ struct MetroHash64 }; +#if defined(__x86_64__) + /*struct CRC32Hash { size_t operator() (StringRef x) const @@ -383,6 +391,8 @@ struct CRC32ILPHash } }; +#endif + typedef UInt64 Value; @@ -451,9 +461,13 @@ int main(int argc, char ** argv) if (!m || m == 1) bench>(data, "StringRef_CityHash64"); if (!m || m == 2) bench (data, "StringRef_FastHash64"); if (!m || m == 3) bench (data, "StringRef_SimpleHash"); + +#if defined(__x86_64__) if (!m || m == 4) bench (data, "StringRef_CrapWow"); if (!m || m == 5) bench (data, "StringRef_CRC32Hash"); if (!m || m == 6) bench (data, "StringRef_CRC32ILPHash"); +#endif + if (!m || m == 7) bench(data, "StringRef_VerySimpleHash"); if (!m || m == 8) bench(data, "StringRef_FarmHash64"); if (!m || m == 9) bench>(data, "StringRef_MetroHash64_1");