mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-04 21:42:39 +00:00
Remove AVX2 to figure out where is the illegal intruction
Enable AVX2 - int32
This commit is contained in:
parent
62487fe2fc
commit
a810ce5dcb
@ -12,7 +12,6 @@
|
||||
#if defined(__AVX2__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
namespace DB::GatherUtils
|
||||
{
|
||||
|
||||
@ -236,240 +235,6 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArrayS
|
||||
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||
}
|
||||
|
||||
// TODO: Discuss about
|
||||
// raise an error : "error: no viable conversion from 'const NumericArraySlice<unsigned int>' to 'const NumericArraySlice<int>'"
|
||||
// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types.
|
||||
// AVX2 UInt specialization
|
||||
// template <>
|
||||
// inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<unsigned>, NumericArraySlice<unsigned>, sliceEqualElements<unsigned,unsigned> >(
|
||||
// const NumericArraySlice<unsigned> & second, const NumericArraySlice<unsigned> & first, const UInt8 * first_null_map, const UInt8 * second_null_map)
|
||||
// {
|
||||
// return sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<int>, NumericArraySlice<int>, sliceEqualElements<int,int> > (
|
||||
// static_cast<const NumericArraySlice<int> &>(second), static_cast<const NumericArraySlice<int> &>(first), second_null_map, first_null_map);
|
||||
// }
|
||||
|
||||
// AVX2 Int64 specialization
|
||||
template <>
|
||||
inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<Int64>, NumericArraySlice<Int64>, sliceEqualElements<Int64,Int64> >(
|
||||
const NumericArraySlice<Int64> & first, const NumericArraySlice<Int64> & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
|
||||
{
|
||||
if (second.size == 0)
|
||||
return true;
|
||||
|
||||
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||
return false;
|
||||
|
||||
const bool has_first_null_map = first_null_map != nullptr;
|
||||
const bool has_second_null_map = second_null_map != nullptr;
|
||||
|
||||
size_t j = 0;
|
||||
short has_mask = 1;
|
||||
const Int64 full = -1, none = 0;
|
||||
const __m256i ones = _mm256_set1_epi64x(full);
|
||||
const __m256i zeros = _mm256_setzero_si256();
|
||||
if (second.size > 3 && first.size > 3)
|
||||
{
|
||||
for (; j < second.size - 3 && has_mask; j += 4)
|
||||
{
|
||||
has_mask = 0;
|
||||
const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(second.data + j));
|
||||
// bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise;
|
||||
__m256i bitmask = has_second_null_map ?
|
||||
_mm256_set_epi64x(
|
||||
(second_null_map[j + 3])? full : none,
|
||||
(second_null_map[j + 2])? full : none,
|
||||
(second_null_map[j + 1])? full : none,
|
||||
(second_null_map[j]) ? full : none)
|
||||
: zeros;
|
||||
|
||||
unsigned i = 0;
|
||||
for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4)
|
||||
{
|
||||
const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(first.data + i));
|
||||
const __m256i first_nm_mask = has_first_null_map?
|
||||
_mm256_set_m128i(
|
||||
_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i + 2))),
|
||||
_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))))
|
||||
: zeros;
|
||||
bitmask =
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
first_nm_mask,
|
||||
_mm256_cmpeq_epi64(f_data, s_data)),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)),
|
||||
_mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))),
|
||||
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)),
|
||||
_mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)),
|
||||
_mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))),
|
||||
bitmask);
|
||||
}
|
||||
|
||||
if (i < first.size)
|
||||
{
|
||||
for (; i < first.size && !has_mask; ++i)
|
||||
{
|
||||
if (has_first_null_map && first_null_map[i])
|
||||
continue;
|
||||
__m256i v_i = _mm256_set1_epi64x(first.data[i]);
|
||||
bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i));
|
||||
has_mask = _mm256_testc_si256(bitmask, ones);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!has_mask && second.size > 3)
|
||||
return false;
|
||||
|
||||
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||
}
|
||||
|
||||
// AVX2 Int16_t specialization
|
||||
template <>
|
||||
inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<int16_t>, NumericArraySlice<int16_t>, sliceEqualElements<int16_t,int16_t> >(
|
||||
const NumericArraySlice<int16_t> & first, const NumericArraySlice<int16_t> & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
|
||||
{
|
||||
if (second.size == 0)
|
||||
return true;
|
||||
|
||||
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||
return false;
|
||||
|
||||
const bool has_first_null_map = first_null_map != nullptr;
|
||||
const bool has_second_null_map = second_null_map != nullptr;
|
||||
|
||||
size_t j = 0;
|
||||
short has_mask = 1;
|
||||
const int16_t full = -1, none = 0;
|
||||
const __m256i ones = _mm256_set1_epi16(full);
|
||||
const __m256i zeros = _mm256_setzero_si256();
|
||||
if (second.size > 15 && first.size > 15)
|
||||
{
|
||||
for (; j < second.size - 15 && has_mask; j += 16)
|
||||
{
|
||||
has_mask = 0;
|
||||
const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(second.data + j));
|
||||
__m256i bitmask = has_second_null_map ?
|
||||
_mm256_set_epi16(
|
||||
(second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none,
|
||||
(second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none,
|
||||
(second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none,
|
||||
(second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none,
|
||||
(second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none,
|
||||
(second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none,
|
||||
(second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none,
|
||||
(second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none)
|
||||
: zeros;
|
||||
unsigned i = 0;
|
||||
for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16)
|
||||
{
|
||||
const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(first.data + i));
|
||||
const __m256i first_nm_mask = has_first_null_map?
|
||||
_mm256_set_m128i(
|
||||
_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i + 8))),
|
||||
_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))))
|
||||
: zeros;
|
||||
bitmask =
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
first_nm_mask,
|
||||
_mm256_cmpeq_epi16(f_data, s_data)),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))),
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)))))
|
||||
),
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))),
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18))))))
|
||||
),
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))),
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)))))
|
||||
),
|
||||
_mm256_or_si256(
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))),
|
||||
_mm256_or_si256(
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))),
|
||||
_mm256_andnot_si256(
|
||||
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)),
|
||||
_mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2))))))
|
||||
)
|
||||
),
|
||||
bitmask);
|
||||
}
|
||||
|
||||
if (i < first.size)
|
||||
{
|
||||
for (; i < first.size && !has_mask; ++i)
|
||||
{
|
||||
if (has_first_null_map && first_null_map[i])
|
||||
continue;
|
||||
__m256i v_i = _mm256_set1_epi16(first.data[i]);
|
||||
bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i));
|
||||
has_mask = _mm256_testc_si256(bitmask, ones);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!has_mask && second.size > 15)
|
||||
return false;
|
||||
|
||||
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||
}
|
||||
|
||||
#elif defined(__SSE4_2__)
|
||||
|
||||
// SSE4.2 Int specialization
|
||||
|
Loading…
Reference in New Issue
Block a user