mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
dbms: porting to aarch64 [#METR-19609].
This commit is contained in:
parent
fefce00f5d
commit
e513e9808b
@ -10,6 +10,10 @@
|
||||
|
||||
#include <DB/Columns/IColumn.h>
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -288,17 +292,20 @@ public:
|
||||
if (result_size_hint)
|
||||
res_data.reserve(result_size_hint > 0 ? result_size_hint : size);
|
||||
|
||||
const UInt8 * filt_pos = &filt[0];
|
||||
const UInt8 * filt_end = filt_pos + size;
|
||||
const T * data_pos = &data[0];
|
||||
|
||||
#if defined(__x86_64__)
|
||||
/** Чуть более оптимизированная версия.
|
||||
* Исходит из допущения, что часто куски последовательно идущих значений
|
||||
* полностью проходят или полностью не проходят фильтр.
|
||||
* Поэтому, будем оптимистично проверять куски по 16 значений.
|
||||
* Поэтому, будем оптимистично проверять куски по SIMD_BYTES значений.
|
||||
*/
|
||||
const UInt8 * filt_pos = &filt[0];
|
||||
const UInt8 * filt_end = filt_pos + size;
|
||||
const UInt8 * filt_end_sse = filt_pos + size / 16 * 16;
|
||||
const T * data_pos = &data[0];
|
||||
|
||||
static constexpr size_t SIMD_BYTES = 16;
|
||||
const __m128i zero16 = _mm_setzero_si128();
|
||||
const UInt8 * filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
while (filt_pos < filt_end_sse)
|
||||
{
|
||||
@ -310,18 +317,19 @@ public:
|
||||
}
|
||||
else if (0xFFFF == mask)
|
||||
{
|
||||
res_data.insert(data_pos, data_pos + 16);
|
||||
res_data.insert(data_pos, data_pos + SIMD_BYTES);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = 0; i < 16; ++i)
|
||||
for (size_t i = 0; i < SIMD_BYTES; ++i)
|
||||
if (filt_pos[i])
|
||||
res_data.push_back(data_pos[i]);
|
||||
}
|
||||
|
||||
filt_pos += 16;
|
||||
data_pos += 16;
|
||||
filt_pos += SIMD_BYTES;
|
||||
data_pos += SIMD_BYTES;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (filt_pos < filt_end)
|
||||
{
|
||||
|
@ -913,7 +913,10 @@ template <> struct FunctionUnaryArithmeticMonotonicity<NameBitNot>
|
||||
|
||||
/// Оптимизации для целочисленного деления на константу.
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#define LIBDIVIDE_USE_SSE2 1
|
||||
#endif
|
||||
|
||||
#include <libdivide.h>
|
||||
|
||||
|
||||
@ -947,6 +950,8 @@ struct DivideIntegralByConstantImpl
|
||||
const A * a_pos = &a[0];
|
||||
const A * a_end = a_pos + size;
|
||||
ResultType * c_pos = &c[0];
|
||||
|
||||
#if defined(__x86_64__)
|
||||
static constexpr size_t values_per_sse_register = 16 / sizeof(A);
|
||||
const A * a_end_sse = a_pos + size / values_per_sse_register * values_per_sse_register;
|
||||
|
||||
@ -958,6 +963,7 @@ struct DivideIntegralByConstantImpl
|
||||
a_pos += values_per_sse_register;
|
||||
c_pos += values_per_sse_register;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (a_pos < a_end)
|
||||
{
|
||||
|
@ -17,8 +17,10 @@
|
||||
#include <DB/Functions/IFunction.h>
|
||||
#include <ext/range.hpp>
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#include <emmintrin.h>
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -233,11 +235,12 @@ struct LowerUpperImpl
|
||||
private:
|
||||
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
|
||||
{
|
||||
const auto flip_case_mask = 'A' ^ 'a';
|
||||
|
||||
#if defined(__x86_64__)
|
||||
const auto bytes_sse = sizeof(__m128i);
|
||||
const auto src_end_sse = src_end - (src_end - src) % bytes_sse;
|
||||
|
||||
const auto flip_case_mask = 'A' ^ 'a';
|
||||
|
||||
const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
|
||||
const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
|
||||
const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);
|
||||
@ -260,6 +263,7 @@ private:
|
||||
/// store result back to destination
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; src < src_end; ++src, ++dst)
|
||||
if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
|
||||
@ -394,6 +398,7 @@ private:
|
||||
|
||||
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
const auto bytes_sse = sizeof(__m128i);
|
||||
auto src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;
|
||||
|
||||
@ -455,7 +460,7 @@ private:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
/// handle remaining symbols
|
||||
while (src < src_end)
|
||||
toCase(src, src_end, dst);
|
||||
|
@ -1,4 +1,6 @@
|
||||
#if defined(__x86_64__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <DB/Columns/IColumn.h>
|
||||
|
||||
@ -15,10 +17,11 @@ size_t countBytesInFilter(const IColumn::Filter & filt)
|
||||
* It would be better to use != 0, but SSE2 does not allow that.
|
||||
*/
|
||||
|
||||
const __m128i zero16 = _mm_setzero_si128();
|
||||
|
||||
const Int8 * pos = reinterpret_cast<const Int8 *>(&filt[0]);
|
||||
const Int8 * end = pos + filt.size();
|
||||
|
||||
#if defined(__x86_64__)
|
||||
const __m128i zero16 = _mm_setzero_si128();
|
||||
const Int8 * end64 = pos + filt.size() / 64 * 64;
|
||||
|
||||
for (; pos < end64; pos += 64)
|
||||
@ -35,6 +38,7 @@ size_t countBytesInFilter(const IColumn::Filter & filt)
|
||||
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(
|
||||
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 48)),
|
||||
zero16))) << 48));
|
||||
#endif
|
||||
|
||||
for (; pos < end; ++pos)
|
||||
count += *pos > 0;
|
||||
@ -71,17 +75,12 @@ void filterArraysImpl(
|
||||
|
||||
IColumn::Offset_t current_src_offset = 0;
|
||||
|
||||
static constexpr size_t SIMD_BYTES = 16;
|
||||
|
||||
const UInt8 * filt_pos = &filt[0];
|
||||
const auto filt_end = filt_pos + size;
|
||||
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
auto offsets_pos = &src_offsets[0];
|
||||
const auto offsets_begin = offsets_pos;
|
||||
|
||||
const __m128i zero_vec = _mm_setzero_si128();
|
||||
|
||||
/// copy array ending at *offset_ptr
|
||||
const auto copy_array = [&] (const IColumn::Offset_t * offset_ptr)
|
||||
{
|
||||
@ -96,6 +95,11 @@ void filterArraysImpl(
|
||||
memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T));
|
||||
};
|
||||
|
||||
#if defined(__x86_64__)
|
||||
const __m128i zero_vec = _mm_setzero_si128();
|
||||
static constexpr size_t SIMD_BYTES = 16;
|
||||
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
|
||||
|
||||
while (filt_pos < filt_end_aligned)
|
||||
{
|
||||
const auto mask = _mm_movemask_epi8(_mm_cmpgt_epi8(
|
||||
@ -149,6 +153,7 @@ void filterArraysImpl(
|
||||
filt_pos += SIMD_BYTES;
|
||||
offsets_pos += SIMD_BYTES;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (filt_pos < filt_end)
|
||||
{
|
||||
|
@ -12,7 +12,9 @@
|
||||
#include <DB/IO/WriteHelpers.h>
|
||||
#include <DB/IO/VarInt.h>
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -91,6 +93,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars_t & data, Column
|
||||
|
||||
if (size)
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
/// Оптимистичная ветка, в которой возможно более эффективное копирование.
|
||||
if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end())
|
||||
{
|
||||
@ -121,6 +124,7 @@ static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars_t & data, Column
|
||||
istr.position() += size;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size);
|
||||
}
|
||||
|
@ -1,4 +1,6 @@
|
||||
#if defined(__x86_64__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <sstream>
|
||||
|
||||
@ -112,6 +114,7 @@ void readStringUntilEOF(String & s, ReadBuffer & buf)
|
||||
*/
|
||||
static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end)
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
static const char tab_chars[16] = {'\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t'};
|
||||
static const char lf_chars[16] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'};
|
||||
static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'};
|
||||
@ -140,6 +143,7 @@ static inline const char * find_first_tab_lf_or_backslash(const char * begin, co
|
||||
if (bit_mask)
|
||||
return begin + __builtin_ctz(bit_mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; begin < end; ++begin)
|
||||
if (*begin == '\t' || *begin == '\n' || *begin == '\\')
|
||||
@ -202,6 +206,7 @@ void readEscapedString(DB::String & s, DB::ReadBuffer & buf)
|
||||
template <char quote>
|
||||
static inline const char * find_first_quote_or_backslash(const char * begin, const char * end)
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
static const char quote_chars[16] = {quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote, quote};
|
||||
static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' };
|
||||
|
||||
@ -226,6 +231,7 @@ static inline const char * find_first_quote_or_backslash(const char * begin, con
|
||||
if (bit_mask)
|
||||
return begin + __builtin_ctz(bit_mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; begin < end; ++begin)
|
||||
if (*begin == quote || *begin == '\\')
|
||||
|
@ -1,4 +1,6 @@
|
||||
#if defined(__x86_64__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
@ -35,6 +37,7 @@ namespace test
|
||||
*/
|
||||
static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end)
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
static const char tab_chars[16] = {'\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t', '\t'};
|
||||
static const char lf_chars[16] = {'\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'};
|
||||
static const char bs_chars[16] = {'\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'};
|
||||
@ -63,7 +66,7 @@ namespace test
|
||||
if (bit_mask)
|
||||
return begin + __builtin_ctz(bit_mask);
|
||||
}
|
||||
|
||||
#endif
|
||||
for (; begin < end; ++begin)
|
||||
if (*begin == '\t' || *begin == '\n' || *begin == '\\')
|
||||
return begin;
|
||||
|
@ -1,4 +1,6 @@
|
||||
#if defined(__x86_64__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
@ -9,6 +11,7 @@
|
||||
#include <DB/Common/Stopwatch.h>
|
||||
|
||||
|
||||
#if defined(__x86_64__)
|
||||
std::ostream & operator<< (std::ostream & ostr, const __m128i vec)
|
||||
{
|
||||
char digits[16];
|
||||
@ -21,6 +24,7 @@ std::ostream & operator<< (std::ostream & ostr, const __m128i vec)
|
||||
|
||||
return ostr;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
namespace test
|
||||
|
@ -15,7 +15,9 @@
|
||||
#include <DB/Common/HashTable/HashMap.h>
|
||||
#include <DB/Interpreters/AggregationCommon.h>
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
/** Выполнять так:
|
||||
@ -74,11 +76,15 @@ DefineStringRef(StringRef_Compare8_1_byUInt64)
|
||||
DefineStringRef(StringRef_Compare16_1_byMemcmp)
|
||||
DefineStringRef(StringRef_Compare16_1_byUInt64_logicAnd)
|
||||
DefineStringRef(StringRef_Compare16_1_byUInt64_bitAnd)
|
||||
|
||||
#if defined(__x86_64__)
|
||||
DefineStringRef(StringRef_Compare16_1_byIntSSE)
|
||||
DefineStringRef(StringRef_Compare16_1_byFloatSSE)
|
||||
DefineStringRef(StringRef_Compare16_1_bySSE4)
|
||||
DefineStringRef(StringRef_Compare16_1_bySSE4_wide)
|
||||
DefineStringRef(StringRef_Compare16_1_bySSE_wide)
|
||||
#endif
|
||||
|
||||
DefineStringRef(StringRef_CompareAlwaysTrue)
|
||||
DefineStringRef(StringRef_CompareAlmostAlwaysTrue)
|
||||
|
||||
@ -190,6 +196,8 @@ inline bool compare_byUInt64_bitAnd(const char * p1, const char * p2)
|
||||
& (reinterpret_cast<const uint64_t *>(p1)[1] == reinterpret_cast<const uint64_t *>(p2)[1]);
|
||||
}
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
inline bool compare_byIntSSE(const char * p1, const char * p2)
|
||||
{
|
||||
return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
|
||||
@ -204,6 +212,8 @@ inline bool compare_byFloatSSE(const char * p1, const char * p2)
|
||||
_mm_loadu_ps(reinterpret_cast<const float *>(p2))));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template <bool compare(const char *, const char *)>
|
||||
inline bool memequal(const char * p1, const char * p2, size_t size)
|
||||
@ -253,6 +263,8 @@ inline bool memequal(const char * p1, const char * p2, size_t size)
|
||||
}
|
||||
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
inline bool memequal_sse41(const char * p1, const char * p2, size_t size)
|
||||
{
|
||||
// const char * p1_end = p1 + size;
|
||||
@ -483,6 +495,8 @@ inline bool memequal_sse_wide(const char * p1, const char * p2, size_t size)
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define Op(METHOD) \
|
||||
inline bool operator==(StringRef_Compare16_1_ ## METHOD lhs, StringRef_Compare16_1_ ## METHOD rhs) \
|
||||
@ -499,6 +513,9 @@ inline bool operator==(StringRef_Compare16_1_ ## METHOD lhs, StringRef_Compare16
|
||||
Op(byMemcmp)
|
||||
Op(byUInt64_logicAnd)
|
||||
Op(byUInt64_bitAnd)
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
Op(byIntSSE)
|
||||
Op(byFloatSSE)
|
||||
|
||||
@ -536,6 +553,8 @@ inline bool operator==(StringRef_Compare16_1_bySSE_wide lhs, StringRef_Compare16
|
||||
return memequal_sse_wide(lhs.data, rhs.data, lhs.size);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
inline bool operator==(StringRef_CompareAlwaysTrue lhs, StringRef_CompareAlwaysTrue rhs)
|
||||
{
|
||||
@ -623,11 +642,13 @@ int main(int argc, char ** argv)
|
||||
if (!m || m == 5) bench<StringRef_Compare16_1_byMemcmp> (data, "StringRef_Compare16_1_byMemcmp");
|
||||
if (!m || m == 6) bench<StringRef_Compare16_1_byUInt64_logicAnd>(data, "StringRef_Compare16_1_byUInt64_logicAnd");
|
||||
if (!m || m == 7) bench<StringRef_Compare16_1_byUInt64_bitAnd> (data, "StringRef_Compare16_1_byUInt64_bitAnd");
|
||||
#if defined(__x86_64__)
|
||||
if (!m || m == 8) bench<StringRef_Compare16_1_byIntSSE> (data, "StringRef_Compare16_1_byIntSSE");
|
||||
if (!m || m == 9) bench<StringRef_Compare16_1_byFloatSSE> (data, "StringRef_Compare16_1_byFloatSSE");
|
||||
if (!m || m == 10) bench<StringRef_Compare16_1_bySSE4> (data, "StringRef_Compare16_1_bySSE4");
|
||||
if (!m || m == 11) bench<StringRef_Compare16_1_bySSE4_wide> (data, "StringRef_Compare16_1_bySSE4_wide");
|
||||
if (!m || m == 12) bench<StringRef_Compare16_1_bySSE_wide> (data, "StringRef_Compare16_1_bySSE_wide");
|
||||
#endif
|
||||
if (!m || m == 100) bench<StringRef_CompareAlwaysTrue> (data, "StringRef_CompareAlwaysTrue");
|
||||
if (!m || m == 101) bench<StringRef_CompareAlmostAlwaysTrue> (data, "StringRef_CompareAlmostAlwaysTrue");
|
||||
|
||||
|
@ -18,7 +18,9 @@
|
||||
#include <DB/Common/HashTable/HashMap.h>
|
||||
#include <DB/Interpreters/AggregationCommon.h>
|
||||
|
||||
#if defined(__x86_64__)
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
/** Выполнять так:
|
||||
@ -137,6 +139,8 @@ struct FastHash64
|
||||
};
|
||||
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
struct CrapWow
|
||||
{
|
||||
size_t operator() (StringRef x) const
|
||||
@ -206,6 +210,8 @@ struct CrapWow
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
struct SimpleHash
|
||||
{
|
||||
@ -306,6 +312,8 @@ struct MetroHash64
|
||||
};
|
||||
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
/*struct CRC32Hash
|
||||
{
|
||||
size_t operator() (StringRef x) const
|
||||
@ -383,6 +391,8 @@ struct CRC32ILPHash
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
typedef UInt64 Value;
|
||||
|
||||
@ -451,9 +461,13 @@ int main(int argc, char ** argv)
|
||||
if (!m || m == 1) bench<StringRef_CompareMemcmp, DefaultHash<StringRef>>(data, "StringRef_CityHash64");
|
||||
if (!m || m == 2) bench<StringRef_CompareMemcmp, FastHash64> (data, "StringRef_FastHash64");
|
||||
if (!m || m == 3) bench<StringRef_CompareMemcmp, SimpleHash> (data, "StringRef_SimpleHash");
|
||||
|
||||
#if defined(__x86_64__)
|
||||
if (!m || m == 4) bench<StringRef_CompareMemcmp, CrapWow> (data, "StringRef_CrapWow");
|
||||
if (!m || m == 5) bench<StringRef_CompareMemcmp, CRC32Hash> (data, "StringRef_CRC32Hash");
|
||||
if (!m || m == 6) bench<StringRef_CompareMemcmp, CRC32ILPHash> (data, "StringRef_CRC32ILPHash");
|
||||
#endif
|
||||
|
||||
if (!m || m == 7) bench<StringRef_CompareMemcmp, VerySimpleHash>(data, "StringRef_VerySimpleHash");
|
||||
if (!m || m == 8) bench<StringRef_CompareMemcmp, FarmHash64>(data, "StringRef_FarmHash64");
|
||||
if (!m || m == 9) bench<StringRef_CompareMemcmp, MetroHash64<metrohash64_1>>(data, "StringRef_MetroHash64_1");
|
||||
|
Loading…
Reference in New Issue
Block a user