dbms: porting to aarch64 [#METR-19609].

This commit is contained in:
Alexey Milovidov 2016-01-14 00:05:11 +03:00
parent fefce00f5d
commit e513e9808b
10 changed files with 107 additions and 31 deletions

View File

@ -10,6 +10,10 @@
#include <DB/Columns/IColumn.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
namespace DB
{
@ -288,17 +292,20 @@ public:
if (result_size_hint)
res_data.reserve(result_size_hint > 0 ? result_size_hint : size);
/** A slightly more optimized version.
* It relies on the assumption that runs of consecutive values often
* either pass the filter entirely or fail it entirely.
* Therefore, we optimistically check chunks of 16 values at a time.
*/
const UInt8 * filt_pos = &filt[0];
const UInt8 * filt_end = filt_pos + size;
const UInt8 * filt_end_sse = filt_pos + size / 16 * 16;
const T * data_pos = &data[0];
#if defined(__x86_64__)
/** A slightly more optimized version.
* It relies on the assumption that runs of consecutive values often
* either pass the filter entirely or fail it entirely.
* Therefore, we optimistically check chunks of SIMD_BYTES values at a time.
*/
static constexpr size_t SIMD_BYTES = 16;
const __m128i zero16 = _mm_setzero_si128();
const UInt8 * filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_sse)
{
@ -310,18 +317,19 @@ public:
}
else if (0xFFFF == mask)
{
res_data.insert(data_pos, data_pos + 16);
res_data.insert(data_pos, data_pos + SIMD_BYTES);
}
else
{
for (size_t i = 0; i < 16; ++i)
for (size_t i = 0; i < SIMD_BYTES; ++i)
if (filt_pos[i])
res_data.push_back(data_pos[i]);
}
filt_pos += 16;
data_pos += 16;
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
#endif
while (filt_pos < filt_end)
{

View File

@ -913,7 +913,10 @@ template <> struct FunctionUnaryArithmeticMonotonicity<NameBitNot>
/// Optimizations for integer division by a constant.
#define LIBDIVIDE_USE_SSE2 1
#if defined(__x86_64__)
#define LIBDIVIDE_USE_SSE2 1
#endif
#include <libdivide.h>
@ -947,6 +950,8 @@ struct DivideIntegralByConstantImpl
const A * a_pos = &a[0];
const A * a_end = a_pos + size;
ResultType * c_pos = &c[0];
#if defined(__x86_64__)
static constexpr size_t values_per_sse_register = 16 / sizeof(A);
const A * a_end_sse = a_pos + size / values_per_sse_register * values_per_sse_register;
@ -958,6 +963,7 @@ struct DivideIntegralByConstantImpl
a_pos += values_per_sse_register;
c_pos += values_per_sse_register;
}
#endif
while (a_pos < a_end)
{

View File

@ -17,8 +17,10 @@
#include <DB/Functions/IFunction.h>
#include <ext/range.hpp>
#include <emmintrin.h>
#include <nmmintrin.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#include <nmmintrin.h>
#endif
namespace DB
@ -233,11 +235,12 @@ struct LowerUpperImpl
private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
const auto flip_case_mask = 'A' ^ 'a';
#if defined(__x86_64__)
const auto bytes_sse = sizeof(__m128i);
const auto src_end_sse = src_end - (src_end - src) % bytes_sse;
const auto flip_case_mask = 'A' ^ 'a';
const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);
@ -260,6 +263,7 @@ private:
/// store result back to destination
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
}
#endif
for (; src < src_end; ++src, ++dst)
if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
@ -394,6 +398,7 @@ private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
#if defined(__x86_64__)
const auto bytes_sse = sizeof(__m128i);
auto src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;
@ -455,7 +460,7 @@ private:
}
}
}
#endif
/// handle remaining symbols
while (src < src_end)
toCase(src, src_end, dst);

View File

@ -1,4 +1,6 @@
#include <emmintrin.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
#include <DB/Columns/IColumn.h>
@ -15,10 +17,11 @@ size_t countBytesInFilter(const IColumn::Filter & filt)
* Лучше было бы использовать != 0, но это не позволяет SSE2.
*/
const __m128i zero16 = _mm_setzero_si128();
const Int8 * pos = reinterpret_cast<const Int8 *>(&filt[0]);
const Int8 * end = pos + filt.size();
#if defined(__x86_64__)
const __m128i zero16 = _mm_setzero_si128();
const Int8 * end64 = pos + filt.size() / 64 * 64;
for (; pos < end64; pos += 64)
@ -35,6 +38,7 @@ size_t countBytesInFilter(const IColumn::Filter & filt)
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 48)),
zero16))) << 48));
#endif
for (; pos < end; ++pos)
count += *pos > 0;
@ -71,17 +75,12 @@ void filterArraysImpl(
IColumn::Offset_t current_src_offset = 0;
static constexpr size_t SIMD_BYTES = 16;
const UInt8 * filt_pos = &filt[0];
const auto filt_end = filt_pos + size;
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
auto offsets_pos = &src_offsets[0];
const auto offsets_begin = offsets_pos;
const __m128i zero_vec = _mm_setzero_si128();
/// copy array ending at *end_offset_ptr
const auto copy_array = [&] (const IColumn::Offset_t * offset_ptr)
{
@ -96,6 +95,11 @@ void filterArraysImpl(
memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T));
};
#if defined(__x86_64__)
const __m128i zero_vec = _mm_setzero_si128();
static constexpr size_t SIMD_BYTES = 16;
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_aligned)
{
const auto mask = _mm_movemask_epi8(_mm_cmpgt_epi8(
@ -149,6 +153,7 @@ void filterArraysImpl(
filt_pos += SIMD_BYTES;
offsets_pos += SIMD_BYTES;
}
#endif
while (filt_pos < filt_end)
{

View File

@ -12,7 +12,9 @@
#include <DB/IO/WriteHelpers.h>
#include <DB/IO/VarInt.h>
#include <emmintrin.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
namespace DB
@ -91,6 +93,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars_t & data, Column
if (size)
{
#if defined(__x86_64__)
/// Optimistic branch in which more efficient copying is possible.
if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end())
{
@ -121,6 +124,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars_t & data, Column
istr.position() += size;
}
else
#endif
{
istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size);
}

View File

@ -1,4 +1,6 @@
#include <emmintrin.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
#include <sstream>
@ -112,6 +114,7 @@ void readStringUntilEOF(String & s, ReadBuffer & buf)
*/
static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end)
{
#if defined(__x86_64__)
static const char tab_chars[16] = {'\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t'};
static const char lf_chars[16] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'};
static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'};
@ -140,6 +143,7 @@ static inline const char * find_first_tab_lf_or_backslash(const char * begin, co
if (bit_mask)
return begin + __builtin_ctz(bit_mask);
}
#endif
for (; begin < end; ++begin)
if (*begin == '\t' || *begin == '\n' || *begin == '\\')
@ -202,6 +206,7 @@ void readEscapedString(DB::String & s, DB::ReadBuffer & buf)
template <char quote>
static inline const char * find_first_quote_or_backslash(const char * begin, const char * end)
{
#if defined(__x86_64__)
static const char quote_chars[16] = {quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote};
static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' };
@ -226,6 +231,7 @@ static inline const char * find_first_quote_or_backslash(const char * begin, con
if (bit_mask)
return begin + __builtin_ctz(bit_mask);
}
#endif
for (; begin < end; ++begin)
if (*begin == quote || *begin == '\\')

View File

@ -1,4 +1,6 @@
#include <emmintrin.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
#include <string>
#include <iostream>
@ -35,6 +37,7 @@ namespace test
*/
static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end)
{
#if defined(__x86_64__)
static const char tab_chars[16] = {'\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t'};
static const char lf_chars[16] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'};
static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'};
@ -63,7 +66,7 @@ namespace test
if (bit_mask)
return begin + __builtin_ctz(bit_mask);
}
#endif
for (; begin < end; ++begin)
if (*begin == '\t' || *begin == '\n' || *begin == '\\')
return begin;

View File

@ -1,4 +1,6 @@
#include <emmintrin.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#endif
#include <iostream>
#include <iomanip>
@ -9,6 +11,7 @@
#include <DB/Common/Stopwatch.h>
#if defined(__x86_64__)
std::ostream & operator<< (std::ostream & ostr, const __m128i vec)
{
char digits[16];
@ -21,6 +24,7 @@ std::ostream & operator<< (std::ostream & ostr, const __m128i vec)
return ostr;
}
#endif
namespace test

View File

@ -15,7 +15,9 @@
#include <DB/Common/HashTable/HashMap.h>
#include <DB/Interpreters/AggregationCommon.h>
#include <smmintrin.h>
#if defined(__x86_64__)
#include <smmintrin.h>
#endif
/** Выполнять так:
@ -74,11 +76,15 @@ DefineStringRef(StringRef_Compare8_1_byUInt64)
DefineStringRef(StringRef_Compare16_1_byMemcmp)
DefineStringRef(StringRef_Compare16_1_byUInt64_logicAnd)
DefineStringRef(StringRef_Compare16_1_byUInt64_bitAnd)
#if defined(__x86_64__)
DefineStringRef(StringRef_Compare16_1_byIntSSE)
DefineStringRef(StringRef_Compare16_1_byFloatSSE)
DefineStringRef(StringRef_Compare16_1_bySSE4)
DefineStringRef(StringRef_Compare16_1_bySSE4_wide)
DefineStringRef(StringRef_Compare16_1_bySSE_wide)
#endif
DefineStringRef(StringRef_CompareAlwaysTrue)
DefineStringRef(StringRef_CompareAlmostAlwaysTrue)
@ -190,6 +196,8 @@ inline bool compare_byUInt64_bitAnd(const char * p1, const char * p2)
& (reinterpret_cast<const uint64_t *>(p1)[1] == reinterpret_cast<const uint64_t *>(p2)[1]);
}
#if defined(__x86_64__)
inline bool compare_byIntSSE(const char * p1, const char * p2)
{
return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
@ -204,6 +212,8 @@ inline bool compare_byFloatSSE(const char * p1, const char * p2)
_mm_loadu_ps(reinterpret_cast<const float *>(p2))));
}
#endif
template <bool compare(const char *, const char *)>
inline bool memequal(const char * p1, const char * p2, size_t size)
@ -253,6 +263,8 @@ inline bool memequal(const char * p1, const char * p2, size_t size)
}
#if defined(__x86_64__)
inline bool memequal_sse41(const char * p1, const char * p2, size_t size)
{
// const char * p1_end = p1 + size;
@ -483,6 +495,8 @@ inline bool memequal_sse_wide(const char * p1, const char * p2, size_t size)
return true;
}
#endif
#define Op(METHOD) \
inline bool operator==(StringRef_Compare16_1_ ## METHOD lhs, StringRef_Compare16_1_ ## METHOD rhs) \
@ -499,6 +513,9 @@ inline bool operator==(StringRef_Compare16_1_ ## METHOD lhs, StringRef_Compare16
Op(byMemcmp)
Op(byUInt64_logicAnd)
Op(byUInt64_bitAnd)
#if defined(__x86_64__)
Op(byIntSSE)
Op(byFloatSSE)
@ -536,6 +553,8 @@ inline bool operator==(StringRef_Compare16_1_bySSE_wide lhs, StringRef_Compare16
return memequal_sse_wide(lhs.data, rhs.data, lhs.size);
}
#endif
inline bool operator==(StringRef_CompareAlwaysTrue lhs, StringRef_CompareAlwaysTrue rhs)
{
@ -623,11 +642,13 @@ int main(int argc, char ** argv)
if (!m || m == 5) bench<StringRef_Compare16_1_byMemcmp> (data, "StringRef_Compare16_1_byMemcmp");
if (!m || m == 6) bench<StringRef_Compare16_1_byUInt64_logicAnd>(data, "StringRef_Compare16_1_byUInt64_logicAnd");
if (!m || m == 7) bench<StringRef_Compare16_1_byUInt64_bitAnd> (data, "StringRef_Compare16_1_byUInt64_bitAnd");
#if defined(__x86_64__)
if (!m || m == 8) bench<StringRef_Compare16_1_byIntSSE> (data, "StringRef_Compare16_1_byIntSSE");
if (!m || m == 9) bench<StringRef_Compare16_1_byFloatSSE> (data, "StringRef_Compare16_1_byFloatSSE");
if (!m || m == 10) bench<StringRef_Compare16_1_bySSE4> (data, "StringRef_Compare16_1_bySSE4");
if (!m || m == 11) bench<StringRef_Compare16_1_bySSE4_wide> (data, "StringRef_Compare16_1_bySSE4_wide");
if (!m || m == 12) bench<StringRef_Compare16_1_bySSE_wide> (data, "StringRef_Compare16_1_bySSE_wide");
#endif
if (!m || m == 100) bench<StringRef_CompareAlwaysTrue> (data, "StringRef_CompareAlwaysTrue");
if (!m || m == 101) bench<StringRef_CompareAlmostAlwaysTrue> (data, "StringRef_CompareAlmostAlwaysTrue");

View File

@ -18,7 +18,9 @@
#include <DB/Common/HashTable/HashMap.h>
#include <DB/Interpreters/AggregationCommon.h>
#include <smmintrin.h>
#if defined(__x86_64__)
#include <smmintrin.h>
#endif
/** Выполнять так:
@ -137,6 +139,8 @@ struct FastHash64
};
#if defined(__x86_64__)
struct CrapWow
{
size_t operator() (StringRef x) const
@ -206,6 +210,8 @@ struct CrapWow
}
};
#endif
struct SimpleHash
{
@ -306,6 +312,8 @@ struct MetroHash64
};
#if defined(__x86_64__)
/*struct CRC32Hash
{
size_t operator() (StringRef x) const
@ -383,6 +391,8 @@ struct CRC32ILPHash
}
};
#endif
typedef UInt64 Value;
@ -451,9 +461,13 @@ int main(int argc, char ** argv)
if (!m || m == 1) bench<StringRef_CompareMemcmp, DefaultHash<StringRef>>(data, "StringRef_CityHash64");
if (!m || m == 2) bench<StringRef_CompareMemcmp, FastHash64> (data, "StringRef_FastHash64");
if (!m || m == 3) bench<StringRef_CompareMemcmp, SimpleHash> (data, "StringRef_SimpleHash");
#if defined(__x86_64__)
if (!m || m == 4) bench<StringRef_CompareMemcmp, CrapWow> (data, "StringRef_CrapWow");
if (!m || m == 5) bench<StringRef_CompareMemcmp, CRC32Hash> (data, "StringRef_CRC32Hash");
if (!m || m == 6) bench<StringRef_CompareMemcmp, CRC32ILPHash> (data, "StringRef_CRC32ILPHash");
#endif
if (!m || m == 7) bench<StringRef_CompareMemcmp, VerySimpleHash>(data, "StringRef_VerySimpleHash");
if (!m || m == 8) bench<StringRef_CompareMemcmp, FarmHash64>(data, "StringRef_FarmHash64");
if (!m || m == 9) bench<StringRef_CompareMemcmp, MetroHash64<metrohash64_1>>(data, "StringRef_MetroHash64_1");