mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
Merge pull request #35723 from ClickHouse/array-has-all-sse-avx2-optimizations
Merging #27653
This commit is contained in:
commit
b160ffd726
@ -49,6 +49,18 @@ if (COMPILER_GCC)
|
|||||||
add_definitions ("-fno-tree-loop-distribute-patterns")
|
add_definitions ("-fno-tree-loop-distribute-patterns")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# ClickHouse developers may guard platform-dependent code with a macro (e.g. `#if ENABLE_MULTITARGET_CODE`).
|
||||||
|
# This option always defines the macro: it is set to 1 when the option is ON and to 0 when it is OFF.
|
||||||
|
# See `src/Common/TargetSpecific.h`
|
||||||
|
option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON)
|
||||||
|
|
||||||
|
if (ENABLE_MULTITARGET_CODE)
|
||||||
|
add_definitions(-DENABLE_MULTITARGET_CODE=1)
|
||||||
|
else()
|
||||||
|
add_definitions(-DENABLE_MULTITARGET_CODE=0)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
add_subdirectory (Access)
|
add_subdirectory (Access)
|
||||||
add_subdirectory (Backups)
|
add_subdirectory (Backups)
|
||||||
add_subdirectory (Columns)
|
add_subdirectory (Columns)
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#include <Functions/TargetSpecific.h>
|
#include <Common/TargetSpecific.h>
|
||||||
|
|
||||||
#include <Common/CpuId.h>
|
#include <Common/CpuId.h>
|
||||||
|
|
@ -96,17 +96,6 @@ if (TARGET ch_contrib::rapidjson)
|
|||||||
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::rapidjson)
|
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::rapidjson)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# ClickHouse developers may use platform-dependent code under some macro (e.g. `#ifdef ENABLE_MULTITARGET`).
|
|
||||||
# If turned ON, this option defines such macro.
|
|
||||||
# See `src/Functions/TargetSpecific.h`
|
|
||||||
option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON)
|
|
||||||
|
|
||||||
if (ENABLE_MULTITARGET_CODE)
|
|
||||||
add_definitions(-DENABLE_MULTITARGET_CODE=1)
|
|
||||||
else()
|
|
||||||
add_definitions(-DENABLE_MULTITARGET_CODE=0)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_subdirectory(GatherUtils)
|
add_subdirectory(GatherUtils)
|
||||||
target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_gatherutils)
|
target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_gatherutils)
|
||||||
|
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <base/map.h>
|
#include <base/map.h>
|
||||||
|
|
||||||
|
#include <Common/TargetSpecific.h>
|
||||||
#include <Functions/FunctionHelpers.h>
|
#include <Functions/FunctionHelpers.h>
|
||||||
#include <Functions/GatherUtils/GatherUtils.h>
|
#include <Functions/GatherUtils/GatherUtils.h>
|
||||||
#include <Functions/GatherUtils/Sources.h>
|
#include <Functions/GatherUtils/Sources.h>
|
||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <Functions/PerformanceAdaptors.h>
|
#include <Functions/PerformanceAdaptors.h>
|
||||||
#include <Functions/TargetSpecific.h>
|
|
||||||
#include <DataTypes/DataTypeString.h>
|
#include <DataTypes/DataTypeString.h>
|
||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
#include <DataTypes/getLeastSupertype.h>
|
#include <DataTypes/getLeastSupertype.h>
|
||||||
|
@ -38,8 +38,8 @@
|
|||||||
#include <Columns/ColumnTuple.h>
|
#include <Columns/ColumnTuple.h>
|
||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <Functions/FunctionHelpers.h>
|
#include <Functions/FunctionHelpers.h>
|
||||||
#include <Functions/TargetSpecific.h>
|
|
||||||
#include <Functions/PerformanceAdaptors.h>
|
#include <Functions/PerformanceAdaptors.h>
|
||||||
|
#include <Common/TargetSpecific.h>
|
||||||
#include <base/range.h>
|
#include <base/range.h>
|
||||||
#include <base/bit_cast.h>
|
#include <base/bit_cast.h>
|
||||||
|
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <Common/TargetSpecific.h>
|
||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
#include <Columns/ColumnVector.h>
|
#include <Columns/ColumnVector.h>
|
||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <Functions/TargetSpecific.h>
|
|
||||||
#include <Functions/PerformanceAdaptors.h>
|
#include <Functions/PerformanceAdaptors.h>
|
||||||
#include <IO/WriteHelpers.h>
|
#include <IO/WriteHelpers.h>
|
||||||
|
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
#include <Core/AccurateComparison.h>
|
#include <Core/AccurateComparison.h>
|
||||||
#include <base/range.h>
|
#include <base/range.h>
|
||||||
#include "GatherUtils.h"
|
#include "GatherUtils.h"
|
||||||
|
#include "sliceEqualElements.h"
|
||||||
|
#include "sliceHasImplAnyAll.h"
|
||||||
|
|
||||||
|
|
||||||
namespace DB::ErrorCodes
|
namespace DB::ErrorCodes
|
||||||
@ -461,39 +463,19 @@ void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, con
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Methods to check if first array has elements from second array, overloaded for various combinations of types.
|
template <typename T>
|
||||||
template <
|
bool insliceEqualElements(const NumericArraySlice<T> & first [[maybe_unused]],
|
||||||
ArraySearchType search_type,
|
size_t first_ind [[maybe_unused]],
|
||||||
typename FirstSliceType,
|
size_t second_ind [[maybe_unused]])
|
||||||
typename SecondSliceType,
|
|
||||||
bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)>
|
|
||||||
bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
|
|
||||||
{
|
{
|
||||||
const bool has_first_null_map = first_null_map != nullptr;
|
if constexpr (is_decimal<T>)
|
||||||
const bool has_second_null_map = second_null_map != nullptr;
|
return accurate::equalsOp(first.data[first_ind].value, first.data[second_ind].value);
|
||||||
|
else
|
||||||
for (size_t i = 0; i < second.size; ++i)
|
return accurate::equalsOp(first.data[first_ind], first.data[second_ind]);
|
||||||
{
|
}
|
||||||
bool has = false;
|
inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind)
|
||||||
for (size_t j = 0; j < first.size && !has; ++j)
|
{
|
||||||
{
|
return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0;
|
||||||
const bool is_first_null = has_first_null_map && first_null_map[j];
|
|
||||||
const bool is_second_null = has_second_null_map && second_null_map[i];
|
|
||||||
|
|
||||||
if (is_first_null && is_second_null)
|
|
||||||
has = true;
|
|
||||||
|
|
||||||
if (!is_first_null && !is_second_null && isEqual(first, second, j, i))
|
|
||||||
has = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (has && search_type == ArraySearchType::Any)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
if (!has && search_type == ArraySearchType::All)
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return search_type == ArraySearchType::All;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <
|
template <
|
||||||
@ -620,55 +602,6 @@ bool sliceHasImpl(const FirstSliceType & first, const SecondSliceType & second,
|
|||||||
return sliceHasImplAnyAll<search_type, FirstSliceType, SecondSliceType, isEqual>(first, second, first_null_map, second_null_map);
|
return sliceHasImplAnyAll<search_type, FirstSliceType, SecondSliceType, isEqual>(first, second, first_null_map, second_null_map);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <typename T, typename U>
|
|
||||||
bool sliceEqualElements(const NumericArraySlice<T> & first [[maybe_unused]],
|
|
||||||
const NumericArraySlice<U> & second [[maybe_unused]],
|
|
||||||
size_t first_ind [[maybe_unused]],
|
|
||||||
size_t second_ind [[maybe_unused]])
|
|
||||||
{
|
|
||||||
/// TODO: Decimal scale
|
|
||||||
if constexpr (is_decimal<T> && is_decimal<U>)
|
|
||||||
return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value);
|
|
||||||
else if constexpr (is_decimal<T> || is_decimal<U>)
|
|
||||||
return false;
|
|
||||||
else
|
|
||||||
return accurate::equalsOp(first.data[first_ind], second.data[second_ind]);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
bool sliceEqualElements(const NumericArraySlice<T> &, const GenericArraySlice &, size_t, size_t)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename U>
|
|
||||||
bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice<U> &, size_t, size_t)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind)
|
|
||||||
{
|
|
||||||
return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
bool insliceEqualElements(const NumericArraySlice<T> & first [[maybe_unused]],
|
|
||||||
size_t first_ind [[maybe_unused]],
|
|
||||||
size_t second_ind [[maybe_unused]])
|
|
||||||
{
|
|
||||||
if constexpr (is_decimal<T>)
|
|
||||||
return accurate::equalsOp(first.data[first_ind].value, first.data[second_ind].value);
|
|
||||||
else
|
|
||||||
return accurate::equalsOp(first.data[first_ind], first.data[second_ind]);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind)
|
|
||||||
{
|
|
||||||
return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <ArraySearchType search_type, typename T, typename U>
|
template <ArraySearchType search_type, typename T, typename U>
|
||||||
bool sliceHas(const NumericArraySlice<T> & first, const NumericArraySlice<U> & second)
|
bool sliceHas(const NumericArraySlice<T> & first, const NumericArraySlice<U> & second)
|
||||||
{
|
{
|
||||||
@ -854,4 +787,3 @@ void resizeConstantSize(ArraySource && array_source, ValueSource && value_source
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
|
include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
|
||||||
|
|
||||||
add_headers_and_sources(clickhouse_functions_gatherutils .)
|
add_headers_and_sources(clickhouse_functions_gatherutils .)
|
||||||
add_library(clickhouse_functions_gatherutils ${clickhouse_functions_gatherutils_sources} ${clickhouse_functions_gatherutils_headers})
|
add_library(clickhouse_functions_gatherutils ${clickhouse_functions_gatherutils_sources} ${clickhouse_functions_gatherutils_headers})
|
||||||
target_link_libraries(clickhouse_functions_gatherutils PRIVATE dbms)
|
target_link_libraries(clickhouse_functions_gatherutils PRIVATE dbms)
|
||||||
@ -14,3 +15,5 @@ endif()
|
|||||||
if (STRIP_DEBUG_SYMBOLS_FUNCTIONS)
|
if (STRIP_DEBUG_SYMBOLS_FUNCTIONS)
|
||||||
target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0")
|
target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
set_target_properties(clickhouse_functions_gatherutils PROPERTIES COMPILE_FLAGS "${X86_INTRINSICS_FLAGS}")
|
||||||
|
41
src/Functions/GatherUtils/sliceEqualElements.h
Normal file
41
src/Functions/GatherUtils/sliceEqualElements.h
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <Core/AccurateComparison.h>
|
||||||
|
#include "Slices.h"
|
||||||
|
|
||||||
|
namespace DB::GatherUtils
|
||||||
|
{
|
||||||
|
|
||||||
|
template <typename T, typename U>
|
||||||
|
bool sliceEqualElements(const NumericArraySlice<T> & first [[maybe_unused]],
|
||||||
|
const NumericArraySlice<U> & second [[maybe_unused]],
|
||||||
|
size_t first_ind [[maybe_unused]],
|
||||||
|
size_t second_ind [[maybe_unused]])
|
||||||
|
{
|
||||||
|
/// TODO: Decimal scale
|
||||||
|
if constexpr (is_decimal<T> && is_decimal<U>)
|
||||||
|
return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value);
|
||||||
|
else if constexpr (is_decimal<T> || is_decimal<U>)
|
||||||
|
return false;
|
||||||
|
else
|
||||||
|
return accurate::equalsOp(first.data[first_ind], second.data[second_ind]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool sliceEqualElements(const NumericArraySlice<T> &, const GenericArraySlice &, size_t, size_t)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename U>
|
||||||
|
bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice<U> &, size_t, size_t)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind)
|
||||||
|
{
|
||||||
|
return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
943
src/Functions/GatherUtils/sliceHasImplAnyAll.h
Normal file
943
src/Functions/GatherUtils/sliceHasImplAnyAll.h
Normal file
@ -0,0 +1,943 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "GatherUtils.h"
|
||||||
|
#include "Slices.h"
|
||||||
|
#include "sliceEqualElements.h"
|
||||||
|
|
||||||
|
#if defined(__SSE4_2__)
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#include <smmintrin.h>
|
||||||
|
#include <nmmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <Common/TargetSpecific.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace DB::GatherUtils
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Returns true when `null_map` is provided and at least one of its first
/// `null_map_size` bytes is non-zero; returns false for a missing (nullptr) map.
inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size)
{
    if (!null_map)
        return false;

    const UInt8 * const map_end = null_map + null_map_size;
    for (const UInt8 * flag = null_map; flag != map_end; ++flag)
    {
        if (*flag)
            return true;
    }

    return false;
}
|
||||||
|
|
||||||
|
template<class T>
|
||||||
|
inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder(
|
||||||
|
size_t j, const NumericArraySlice<T> & first, const NumericArraySlice<T> & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
for (; j < second.size; ++j)
|
||||||
|
{
|
||||||
|
// skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null
|
||||||
|
if (has_second_null_map && second_null_map[j])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
bool found = false;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < first.size; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (first.data[i] == second.data[j])
|
||||||
|
{
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!found)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
DECLARE_AVX2_SPECIFIC_CODE (
|
||||||
|
|
||||||
|
// AVX2 Int64, UInt64 specialization
// Returns true when every element of `second` is matched by an element of `first`.
// Null handling as implemented below: an empty `second` yields true; if `second_null_map`
// flags any element while `first_null_map` flags none, the result is false; otherwise flagged
// elements of `second` count as already matched and flagged elements of `first` are excluded
// from the comparisons. `second` is processed in groups of 4 lanes; the elements left after
// the last full group are handled by hasAllIntegralLoopRemainder.
|
||||||
|
template<typename IntType>
|
||||||
|
requires (std::is_same_v<IntType, Int64> || std::is_same_v<IntType, UInt64>)
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64(
|
||||||
|
const NumericArraySlice<IntType> & first,
|
||||||
|
const NumericArraySlice<IntType> & second,
|
||||||
|
const UInt8 * first_null_map,
|
||||||
|
const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
if (second.size == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
// A null in `second` can only be matched by a null in `first`.
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
size_t j = 0;
|
||||||
|
// Non-zero while every group of `second` processed so far was fully matched;
// it is only cleared inside the vector loop below.
int has_mask = 1;
|
||||||
|
// Lane values: all bits set / all bits clear.
static constexpr Int64 full = -1, none = 0;
|
||||||
|
const __m256i ones = _mm256_set1_epi64x(full);
|
||||||
|
const __m256i zeros = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
// The vector path needs at least one full 4-element group in both arrays
// (the guard also keeps `size - 3` positive in the loop conditions).
if (second.size > 3 && first.size > 3)
|
||||||
|
{
|
||||||
|
for (; j < second.size - 3 && has_mask; j += 4)
|
||||||
|
{
|
||||||
|
has_mask = 0;
|
||||||
|
const __m256i second_data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(second.data + j));
|
||||||
|
// bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise;
|
||||||
|
__m256i bitmask = has_second_null_map ?
|
||||||
|
_mm256_set_epi64x(
|
||||||
|
(second_null_map[j + 3])? full : none,
|
||||||
|
(second_null_map[j + 2])? full : none,
|
||||||
|
(second_null_map[j + 1])? full : none,
|
||||||
|
(second_null_map[j]) ? full : none)
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
// Walk `first` 4 elements at a time; after each step has_mask becomes non-zero
// once every bit of `bitmask` is set, i.e. all four `second` lanes are matched.
for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4)
|
||||||
|
{
|
||||||
|
const __m256i first_data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(first.data + i));
|
||||||
|
// Null flags of first[i..i+3], each byte sign-extended to a 64-bit lane.
// NOTE(review): each 16-byte load reads more bytes than the conversion consumes;
// presumably the null map is padded enough for that - confirm.
const __m256i first_nm_mask = has_first_null_map?
|
||||||
|
_mm256_set_m128i(
|
||||||
|
_mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i + 2))),
|
||||||
|
_mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))))
|
||||||
|
: zeros;
|
||||||
|
// OR into `bitmask` the lane-wise equality of `second_data` with `first_data` and with its
// three rotations by one 64-bit lane (the index vectors move pairs of 32-bit lanes), so each
// `second` lane is compared with all four `first` values. The null mask is permuted the same
// way and applied with andnot.
// NOTE(review): andnot clears only the bits that are set in the mask lane, so this appears to
// rely on null-map bytes being 0 or one fixed non-zero value (e.g. 1) - confirm.
bitmask =
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
first_nm_mask,
|
||||||
|
_mm256_cmpeq_epi64(second_data, first_data)),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)),
|
||||||
|
_mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))),
|
||||||
|
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)),
|
||||||
|
_mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)),
|
||||||
|
_mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))),
|
||||||
|
bitmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scalar tail over the remaining elements of `first`: broadcast each non-null value
// and compare it with all four `second` lanes at once.
if (i < first.size)
|
||||||
|
{
|
||||||
|
for (; i < first.size && !has_mask; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
__m256i v_i = _mm256_set1_epi64x(first.data[i]);
|
||||||
|
bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(second_data, v_i));
|
||||||
|
has_mask = _mm256_testc_si256(bitmask, ones);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// has_mask is zero only when the last processed group contained an unmatched element.
if (!has_mask && second.size > 3)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Elements of `second` from index j onwards were not covered by the vector loop.
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
// AVX2 Int32, UInt32 specialization
// Returns true when every element of `second` is matched by an element of `first`.
// Null handling as implemented below: an empty `second` yields true; if `second_null_map`
// flags any element while `first_null_map` flags none, the result is false; otherwise flagged
// elements of `second` count as already matched and flagged elements of `first` are excluded
// from the comparisons. `second` is processed in groups of 8 lanes; the elements left after
// the last full group are handled by hasAllIntegralLoopRemainder.
|
||||||
|
template<typename IntType>
|
||||||
|
requires (std::is_same_v<IntType, Int32> || std::is_same_v<IntType, UInt32>)
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32(
|
||||||
|
const NumericArraySlice<IntType> & first,
|
||||||
|
const NumericArraySlice<IntType> & second,
|
||||||
|
const UInt8 * first_null_map,
|
||||||
|
const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
if (second.size == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
// A null in `second` can only be matched by a null in `first`.
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
size_t j = 0;
|
||||||
|
// Non-zero while every group of `second` processed so far was fully matched;
// it is only cleared inside the vector loop below.
int has_mask = 1;
|
||||||
|
// Lane values: all bits set / all bits clear.
static constexpr int full = -1, none = 0;
|
||||||
|
|
||||||
|
const __m256i ones = _mm256_set1_epi32(full);
|
||||||
|
const __m256i zeros = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
// The vector path needs at least one full 8-element group in both arrays
// (the guard also keeps `size - 7` positive in the loop conditions).
if (second.size > 7 && first.size > 7)
|
||||||
|
{
|
||||||
|
for (; j < second.size - 7 && has_mask; j += 8)
|
||||||
|
{
|
||||||
|
has_mask = 0;
|
||||||
|
const __m256i second_data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(second.data + j));
|
||||||
|
// bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise;
|
||||||
|
__m256i bitmask = has_second_null_map ?
|
||||||
|
_mm256_set_epi32(
|
||||||
|
(second_null_map[j + 7]) ? full : none,
|
||||||
|
(second_null_map[j + 6]) ? full : none,
|
||||||
|
(second_null_map[j + 5]) ? full : none,
|
||||||
|
(second_null_map[j + 4]) ? full : none,
|
||||||
|
(second_null_map[j + 3]) ? full : none,
|
||||||
|
(second_null_map[j + 2]) ? full : none,
|
||||||
|
(second_null_map[j + 1]) ? full : none,
|
||||||
|
(second_null_map[j]) ? full : none)
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
// Walk `first` 8 elements at a time; after each step has_mask becomes non-zero
// once every bit of `bitmask` is set, i.e. all eight `second` lanes are matched.
for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8)
|
||||||
|
{
|
||||||
|
const __m256i first_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(first.data + i));
|
||||||
|
// Create a mask to avoid comparing null elements
|
||||||
|
// set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations
// NOTE(review): each 16-byte load reads more bytes than the conversion consumes;
// presumably the null map is padded enough for that - confirm.
|
||||||
|
const __m256i first_nm_mask = has_first_null_map?
|
||||||
|
_mm256_set_m128i(
|
||||||
|
_mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i + 4))),
|
||||||
|
_mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))))
|
||||||
|
: zeros;
|
||||||
|
// OR into `bitmask` the lane-wise equality of `second_data` with `first_data` and with its
// seven rotations by one 32-bit lane, so each `second` lane is compared with all eight
// `first` values. The null mask is permuted the same way and applied with andnot.
// NOTE(review): andnot clears only the bits that are set in the mask lane, so this appears to
// rely on null-map bytes being 0 or one fixed non-zero value (e.g. 1) - confirm.
bitmask =
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
first_nm_mask,
|
||||||
|
_mm256_cmpeq_epi32(second_data, first_data)),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)),
|
||||||
|
_mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)),
|
||||||
|
_mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)),
|
||||||
|
_mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(4,3,2,1,0,7,6,5)))))
|
||||||
|
),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)),
|
||||||
|
_mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)),
|
||||||
|
_mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)),
|
||||||
|
_mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)),
|
||||||
|
_mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))),
|
||||||
|
bitmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scalar tail over the remaining elements of `first`: broadcast each non-null value
// and compare it with all eight `second` lanes at once.
if (i < first.size)
|
||||||
|
{
|
||||||
|
for (; i < first.size && !has_mask; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
__m256i v_i = _mm256_set1_epi32(first.data[i]);
|
||||||
|
bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(second_data, v_i));
|
||||||
|
has_mask = _mm256_testc_si256(bitmask, ones);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// has_mask is zero only when the last processed group contained an unmatched element.
if (!has_mask && second.size > 7)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Elements of `second` from index j onwards were not covered by the vector loop.
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
// AVX2 Int16, UInt16 specialization
// Returns true when every element of `second` is matched by an element of `first`.
// Null handling as implemented below: an empty `second` yields true; if `second_null_map`
// flags any element while `first_null_map` flags none, the result is false; otherwise flagged
// elements of `second` count as already matched and flagged elements of `first` are excluded
// from the comparisons. `second` is processed in groups of 16 lanes; the elements left after
// the last full group are handled by hasAllIntegralLoopRemainder.
|
||||||
|
template<typename IntType>
|
||||||
|
requires (std::is_same_v<IntType, Int16> || std::is_same_v<IntType, UInt16>)
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16(
|
||||||
|
const NumericArraySlice<IntType> & first,
|
||||||
|
const NumericArraySlice<IntType> & second,
|
||||||
|
const UInt8 * first_null_map,
|
||||||
|
const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
if (second.size == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
// A null in `second` can only be matched by a null in `first`.
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
size_t j = 0;
|
||||||
|
// Non-zero while every group of `second` processed so far was fully matched;
// it is only cleared inside the vector loop below.
int has_mask = 1;
|
||||||
|
// Lane values: all bits set / all bits clear.
static constexpr int16_t full = -1, none = 0;
|
||||||
|
const __m256i ones = _mm256_set1_epi16(full);
|
||||||
|
const __m256i zeros = _mm256_setzero_si256();
|
||||||
|
// The vector path needs at least one full 16-element group in both arrays
// (the guard also keeps `size - 15` positive in the loop conditions).
if (second.size > 15 && first.size > 15)
|
||||||
|
{
|
||||||
|
for (; j < second.size - 15 && has_mask; j += 16)
|
||||||
|
{
|
||||||
|
has_mask = 0;
|
||||||
|
const __m256i second_data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(second.data + j));
|
||||||
|
// Lanes of `second` flagged in its null map start out as all ones (already matched).
__m256i bitmask = has_second_null_map ?
|
||||||
|
_mm256_set_epi16(
|
||||||
|
(second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none,
|
||||||
|
(second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none,
|
||||||
|
(second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none,
|
||||||
|
(second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none,
|
||||||
|
(second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none,
|
||||||
|
(second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none,
|
||||||
|
(second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none,
|
||||||
|
(second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none)
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
// Walk `first` 16 elements at a time; after each step has_mask becomes non-zero
// once every bit of `bitmask` is set, i.e. all sixteen `second` lanes are matched.
for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16)
|
||||||
|
{
|
||||||
|
const __m256i first_data = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(first.data + i));
|
||||||
|
// Null flags of first[i..i+15], each byte sign-extended to a 16-bit lane.
// NOTE(review): each 16-byte load reads more bytes than the conversion consumes;
// presumably the null map is padded enough for that - confirm.
const __m256i first_nm_mask = has_first_null_map?
|
||||||
|
_mm256_set_m128i(
|
||||||
|
_mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i + 8))),
|
||||||
|
_mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))))
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
// OR into `bitmask` the lane-wise equality of `second_data` with 16 arrangements of `first_data`.
// _mm256_shuffle_epi8 works within each 128-bit half, so the first eight terms rotate the eight
// 16-bit lanes inside each half, and the last eight repeat those rotations after
// _mm256_permute2x128_si256 has swapped the halves; together each `second` lane is compared with
// all sixteen `first` values. The null mask is rearranged identically and applied with andnot.
// NOTE(review): andnot clears only the bits that are set in the mask lane, so this appears to
// rely on null-map bytes being 0 or one fixed non-zero value (e.g. 1) - confirm.
bitmask =
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
first_nm_mask,
|
||||||
|
_mm256_cmpeq_epi16(second_data, first_data)),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)))))
|
||||||
|
),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18))))))
|
||||||
|
),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_permute2x128_si256(first_data, first_data, 1))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)))))
|
||||||
|
),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data ,first_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))),
|
||||||
|
_mm256_or_si256(
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data ,first_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))),
|
||||||
|
_mm256_andnot_si256(
|
||||||
|
_mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)),
|
||||||
|
_mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2))))))
|
||||||
|
)
|
||||||
|
),
|
||||||
|
bitmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scalar tail over the remaining elements of `first`: broadcast each non-null value
// and compare it with all sixteen `second` lanes at once.
if (i < first.size)
|
||||||
|
{
|
||||||
|
for (; i < first.size && !has_mask; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
__m256i v_i = _mm256_set1_epi16(first.data[i]);
|
||||||
|
bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(second_data, v_i));
|
||||||
|
has_mask = _mm256_testc_si256(bitmask, ones);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// has_mask is zero only when the last processed group contained an unmatched element.
if (!has_mask && second.size > 15)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Elements of `second` from index j onwards were not covered by the vector loop.
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__SSE4_2__)
|
||||||
|
|
||||||
|
DECLARE_SSE42_SPECIFIC_CODE (
|
||||||
|
|
||||||
|
// SSE4.2 Int64, UInt64 specialization
|
||||||
|
template<typename IntType>
|
||||||
|
requires (std::is_same_v<IntType, Int64> || std::is_same_v<IntType, UInt64>)
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64(
|
||||||
|
const NumericArraySlice<IntType> & first,
|
||||||
|
const NumericArraySlice<IntType> & second,
|
||||||
|
const UInt8 * first_null_map,
|
||||||
|
const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
if (second.size == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
size_t j = 0;
|
||||||
|
int has_mask = 1;
|
||||||
|
static constexpr Int64 full = -1, none = 0;
|
||||||
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
|
if (second.size > 1 && first.size > 1)
|
||||||
|
{
|
||||||
|
for (; j < second.size - 1 && has_mask; j += 2)
|
||||||
|
{
|
||||||
|
has_mask = 0;
|
||||||
|
const __m128i second_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(second.data + j));
|
||||||
|
__m128i bitmask = has_second_null_map ?
|
||||||
|
_mm_set_epi64x(
|
||||||
|
(second_null_map[j + 1]) ? full : none,
|
||||||
|
(second_null_map[j]) ? full : none)
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
|
||||||
|
for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2)
|
||||||
|
{
|
||||||
|
const __m128i first_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(first.data + i));
|
||||||
|
const __m128i first_nm_mask = has_first_null_map ?
|
||||||
|
_mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i)))
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
bitmask =
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
first_nm_mask,
|
||||||
|
_mm_cmpeq_epi64(second_data, first_data)),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)),
|
||||||
|
_mm_cmpeq_epi64(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(1,0,3,2))))),
|
||||||
|
bitmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < first.size)
|
||||||
|
{
|
||||||
|
for (; i < first.size && !has_mask; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
__m128i v_i = _mm_set1_epi64x(first.data[i]);
|
||||||
|
bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(second_data, v_i));
|
||||||
|
has_mask = _mm_test_all_ones(bitmask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!has_mask && second.size > 1)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SSE4.2 Int32, UInt32 specialization
|
||||||
|
template<typename IntType>
|
||||||
|
requires (std::is_same_v<IntType, Int32> || std::is_same_v<IntType, UInt32>)
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32(
|
||||||
|
const NumericArraySlice<IntType> & first,
|
||||||
|
const NumericArraySlice<IntType> & second,
|
||||||
|
const UInt8 * first_null_map,
|
||||||
|
const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
if (second.size == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
size_t j = 0;
|
||||||
|
int has_mask = 1;
|
||||||
|
static constexpr int full = -1, none = 0;
|
||||||
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
|
if (second.size > 3 && first.size > 3)
|
||||||
|
{
|
||||||
|
for (; j < second.size - 3 && has_mask; j += 4)
|
||||||
|
{
|
||||||
|
has_mask = 0;
|
||||||
|
const __m128i second_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(second.data + j));
|
||||||
|
__m128i bitmask = has_second_null_map ?
|
||||||
|
_mm_set_epi32(
|
||||||
|
(second_null_map[j + 3]) ? full : none,
|
||||||
|
(second_null_map[j + 2]) ? full : none,
|
||||||
|
(second_null_map[j + 1]) ? full : none,
|
||||||
|
(second_null_map[j]) ? full : none)
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4)
|
||||||
|
{
|
||||||
|
const __m128i first_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(first.data + i));
|
||||||
|
const __m128i first_nm_mask = has_first_null_map ?
|
||||||
|
_mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i)))
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
bitmask =
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
first_nm_mask,
|
||||||
|
_mm_cmpeq_epi32(second_data, first_data)),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)),
|
||||||
|
_mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(2,1,0,3))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)),
|
||||||
|
_mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(1,0,3,2)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)),
|
||||||
|
_mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(0,3,2,1)))))
|
||||||
|
),
|
||||||
|
bitmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < first.size)
|
||||||
|
{
|
||||||
|
for (; i < first.size && !has_mask; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
__m128i r_i = _mm_set1_epi32(first.data[i]);
|
||||||
|
bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(second_data, r_i));
|
||||||
|
has_mask = _mm_test_all_ones(bitmask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!has_mask && second.size > 3)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SSE4.2 Int16, UInt16 specialization
|
||||||
|
template<typename IntType>
|
||||||
|
requires (std::is_same_v<IntType, Int16> || std::is_same_v<IntType, UInt16>)
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16(
|
||||||
|
const NumericArraySlice<IntType> & first,
|
||||||
|
const NumericArraySlice<IntType> & second,
|
||||||
|
const UInt8 * first_null_map,
|
||||||
|
const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
if (second.size == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
size_t j = 0;
|
||||||
|
int has_mask = 1;
|
||||||
|
static constexpr int16_t full = -1, none = 0;
|
||||||
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
|
if (second.size > 6 && first.size > 6)
|
||||||
|
{
|
||||||
|
for (; j < second.size - 7 && has_mask; j += 8)
|
||||||
|
{
|
||||||
|
has_mask = 0;
|
||||||
|
const __m128i second_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(second.data + j));
|
||||||
|
__m128i bitmask = has_second_null_map ?
|
||||||
|
_mm_set_epi16(
|
||||||
|
(second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none,
|
||||||
|
(second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none,
|
||||||
|
(second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none,
|
||||||
|
(second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none)
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8)
|
||||||
|
{
|
||||||
|
const __m128i first_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(first.data + i));
|
||||||
|
const __m128i first_nm_mask = has_first_null_map ?
|
||||||
|
_mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i)))
|
||||||
|
: zeros;
|
||||||
|
bitmask =
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
first_nm_mask,
|
||||||
|
_mm_cmpeq_epi16(second_data, first_data)),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)),
|
||||||
|
_mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)),
|
||||||
|
_mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)),
|
||||||
|
_mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))))
|
||||||
|
),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)),
|
||||||
|
_mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)),
|
||||||
|
_mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)),
|
||||||
|
_mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)),
|
||||||
|
_mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2))))))
|
||||||
|
),
|
||||||
|
bitmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < first.size)
|
||||||
|
{
|
||||||
|
for (; i < first.size && !has_mask; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
__m128i v_i = _mm_set1_epi16(first.data[i]);
|
||||||
|
bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(second_data, v_i));
|
||||||
|
has_mask = _mm_test_all_ones(bitmask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!has_mask && second.size > 6)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Int8/UInt8 version is faster with SSE than with AVX2
|
||||||
|
// SSE2 Int8, UInt8 specialization
|
||||||
|
template<typename IntType>
|
||||||
|
requires (std::is_same_v<IntType, Int8> || std::is_same_v<IntType, UInt8>)
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt8(
|
||||||
|
const NumericArraySlice<IntType> & first,
|
||||||
|
const NumericArraySlice<IntType> & second,
|
||||||
|
const UInt8 * first_null_map,
|
||||||
|
const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
if (second.size == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
size_t j = 0;
|
||||||
|
int has_mask = 1;
|
||||||
|
static constexpr int8_t full = -1, none = 0;
|
||||||
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
|
|
||||||
|
if (second.size > 15 && first.size > 15)
|
||||||
|
{
|
||||||
|
for (; j < second.size - 15 && has_mask; j += 16)
|
||||||
|
{
|
||||||
|
has_mask = 0;
|
||||||
|
const __m128i second_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(second.data + j));
|
||||||
|
__m128i bitmask = has_second_null_map ?
|
||||||
|
_mm_set_epi8(
|
||||||
|
(second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none,
|
||||||
|
(second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none,
|
||||||
|
(second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none,
|
||||||
|
(second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none,
|
||||||
|
(second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none,
|
||||||
|
(second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none,
|
||||||
|
(second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none,
|
||||||
|
(second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none)
|
||||||
|
: zeros;
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16)
|
||||||
|
{
|
||||||
|
const __m128i first_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(first.data + i));
|
||||||
|
const __m128i first_nm_mask = has_first_null_map ?
|
||||||
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(first_null_map + i))
|
||||||
|
: zeros;
|
||||||
|
bitmask =
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
first_nm_mask,
|
||||||
|
_mm_cmpeq_epi8(second_data, first_data)),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)))))
|
||||||
|
),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))),
|
||||||
|
_mm_or_si128(
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))),
|
||||||
|
_mm_andnot_si128(
|
||||||
|
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)),
|
||||||
|
_mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))),
|
||||||
|
bitmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < first.size)
|
||||||
|
{
|
||||||
|
for (; i < first.size && !has_mask; ++i)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
__m128i v_i = _mm_set1_epi8(first.data[i]);
|
||||||
|
bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(second_data, v_i));
|
||||||
|
has_mask = _mm_test_all_ones(bitmask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!has_mask && second.size > 15)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <
|
||||||
|
ArraySearchType search_type,
|
||||||
|
typename FirstSliceType,
|
||||||
|
typename SecondSliceType,
|
||||||
|
bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)>
|
||||||
|
bool sliceHasImplAnyAllGenericImpl(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
const bool has_first_null_map = first_null_map != nullptr;
|
||||||
|
const bool has_second_null_map = second_null_map != nullptr;
|
||||||
|
|
||||||
|
const bool has_second_null = hasNull(second_null_map, second.size);
|
||||||
|
if (has_second_null)
|
||||||
|
{
|
||||||
|
const bool has_first_null = hasNull(first_null_map, first.size);
|
||||||
|
|
||||||
|
if (has_first_null && search_type == ArraySearchType::Any)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!has_first_null && search_type == ArraySearchType::All)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < second.size; ++i)
|
||||||
|
{
|
||||||
|
if (has_second_null_map && second_null_map[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
bool has = false;
|
||||||
|
|
||||||
|
for (size_t j = 0; j < first.size && !has; ++j)
|
||||||
|
{
|
||||||
|
if (has_first_null_map && first_null_map[j])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (isEqual(first, second, j, i))
|
||||||
|
{
|
||||||
|
has = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (has && search_type == ArraySearchType::Any)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (!has && search_type == ArraySearchType::All)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return search_type == ArraySearchType::All;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Methods to check if first array has elements from second array, overloaded for various combinations of types.
|
||||||
|
template <
|
||||||
|
ArraySearchType search_type,
|
||||||
|
typename FirstSliceType,
|
||||||
|
typename SecondSliceType,
|
||||||
|
bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)>
|
||||||
|
inline ALWAYS_INLINE bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map)
|
||||||
|
{
|
||||||
|
#if USE_MULTITARGET_CODE
|
||||||
|
if constexpr (search_type == ArraySearchType::All && std::is_same_v<FirstSliceType, SecondSliceType>)
|
||||||
|
{
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
if (isArchSupported(TargetArch::AVX2))
|
||||||
|
{
|
||||||
|
if constexpr (std::is_same_v<FirstSliceType, NumericArraySlice<Int16>> || std::is_same_v<FirstSliceType, NumericArraySlice<UInt16>>)
|
||||||
|
{
|
||||||
|
return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt16(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
else if constexpr (std::is_same_v<FirstSliceType, NumericArraySlice<Int32>> || std::is_same_v<FirstSliceType, NumericArraySlice<UInt32>>)
|
||||||
|
{
|
||||||
|
return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt32(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
else if constexpr (std::is_same_v<FirstSliceType, NumericArraySlice<Int64>> || std::is_same_v<FirstSliceType, NumericArraySlice<UInt64>>)
|
||||||
|
{
|
||||||
|
return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt64(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (isArchSupported(TargetArch::SSE42))
|
||||||
|
{
|
||||||
|
if constexpr (std::is_same_v<FirstSliceType, NumericArraySlice<Int8>> || std::is_same_v<FirstSliceType, NumericArraySlice<UInt8>>)
|
||||||
|
{
|
||||||
|
return TargetSpecific::SSE42::sliceHasImplAnyAllImplInt8(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
else if constexpr (std::is_same_v<FirstSliceType, NumericArraySlice<Int16>> || std::is_same_v<FirstSliceType, NumericArraySlice<UInt16>>)
|
||||||
|
{
|
||||||
|
return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt16(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
else if constexpr (std::is_same_v<FirstSliceType, NumericArraySlice<Int32>> || std::is_same_v<FirstSliceType, NumericArraySlice<UInt32>>)
|
||||||
|
{
|
||||||
|
return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt32(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
else if constexpr (std::is_same_v<FirstSliceType, NumericArraySlice<Int64>> || std::is_same_v<FirstSliceType, NumericArraySlice<UInt64>>)
|
||||||
|
{
|
||||||
|
return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt64(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return sliceHasImplAnyAllGenericImpl<search_type, FirstSliceType, SecondSliceType, isEqual>(first, second, first_null_map, second_null_map);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -1,8 +1,8 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <Functions/TargetSpecific.h>
|
|
||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
|
|
||||||
|
#include <Common/TargetSpecific.h>
|
||||||
#include <Common/Stopwatch.h>
|
#include <Common/Stopwatch.h>
|
||||||
#include <Interpreters/Context.h>
|
#include <Interpreters/Context.h>
|
||||||
|
|
||||||
|
@ -6,8 +6,8 @@
|
|||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <Functions/FunctionHelpers.h>
|
#include <Functions/FunctionHelpers.h>
|
||||||
#include <Functions/FunctionFactory.h>
|
#include <Functions/FunctionFactory.h>
|
||||||
#include <Functions/TargetSpecific.h>
|
|
||||||
#include <Functions/PerformanceAdaptors.h>
|
#include <Functions/PerformanceAdaptors.h>
|
||||||
|
#include <Common/TargetSpecific.h>
|
||||||
#include <base/range.h>
|
#include <base/range.h>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
|
139
src/Functions/tests/gtest_has_all.cpp
Normal file
139
src/Functions/tests/gtest_has_all.cpp
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
#include <random>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
#include <Functions/GatherUtils/Algorithms.h>
|
||||||
|
|
||||||
|
using namespace DB::GatherUtils;
|
||||||
|
|
||||||
|
|
||||||
|
/// Returns a uniform integer distribution over [min, max] together with a Mersenne Twister
/// engine seeded from std::random_device, as a (distribution, engine) pair.
auto uni_int_dist(int min, int max)
{
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    return std::make_pair(std::uniform_int_distribution<>(min, max), engine);
}
|
||||||
|
|
||||||
|
template<class T>
|
||||||
|
void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present)
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < array_size; ++i)
|
||||||
|
{
|
||||||
|
array_elements[i] = i;
|
||||||
|
}
|
||||||
|
auto [dist, gen] = uni_int_dist(0, array_size - 1);
|
||||||
|
for (size_t i = 0; i < nb_elements_to_have; ++i)
|
||||||
|
{
|
||||||
|
elements_to_have[i] = array_elements[dist(gen)];
|
||||||
|
}
|
||||||
|
if (!all_elements_present)
|
||||||
|
{
|
||||||
|
/// make one element to be searched for missing from the target array
|
||||||
|
elements_to_have[nb_elements_to_have - 1] = array_size + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clears `null_map` and then marks randomly chosen positions as NULL (byte value 1).
///
/// Up to `nb_null_elements` random positions are drawn; draws may repeat, so fewer distinct
/// positions can end up marked. The last element is never marked. Maps with fewer than two
/// elements are only cleared, because there is no position that may be marked.
void nullMapInit(UInt8 * null_map, size_t null_map_size, size_t nb_null_elements)
{
    for (size_t i = 0; i < null_map_size; ++i)
    {
        null_map[i] = 0;
    }

    /// Avoids the unsigned underflow of null_map_size - 2 and an invalid distribution range.
    if (null_map_size < 2 || nb_null_elements == 0)
        return;

    /// -2 to keep the last element of the array non-null
    auto [dist, gen] = uni_int_dist(0, static_cast<int>(null_map_size - 2));
    for (size_t i = 0; i < null_map_size - 1 && i < nb_null_elements; ++i)
    {
        null_map[dist(gen)] = 1;
    }
}
|
||||||
|
|
||||||
|
template<class T>
|
||||||
|
bool testHasAll(size_t nb_elements_to_have, size_t array_size, bool with_null_maps, bool all_elements_present)
|
||||||
|
{
|
||||||
|
auto array_elements = std::make_unique<T[]>(array_size);
|
||||||
|
auto elements_to_have = std::make_unique<T[]>(nb_elements_to_have);
|
||||||
|
|
||||||
|
std::unique_ptr<UInt8[]> first_nm = nullptr, second_nm = nullptr;
|
||||||
|
if (with_null_maps)
|
||||||
|
{
|
||||||
|
first_nm = std::make_unique<UInt8[]>(array_size);
|
||||||
|
second_nm = std::make_unique<UInt8[]>(nb_elements_to_have);
|
||||||
|
/// add a null to elements to have, but not to the target array, making the answer negative
|
||||||
|
nullMapInit(first_nm.get(), array_size, 0);
|
||||||
|
nullMapInit(second_nm.get(), nb_elements_to_have, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
arrayInit(elements_to_have.get(), nb_elements_to_have, array_elements.get(), array_size, all_elements_present);
|
||||||
|
|
||||||
|
NumericArraySlice<T> first = {array_elements.get(), array_size};
|
||||||
|
NumericArraySlice<T> second = {elements_to_have.get(), nb_elements_to_have};
|
||||||
|
|
||||||
|
/// check whether all elements of the second array are also elements of the first array, overloaded for various combinations of types.
|
||||||
|
return sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<T>, NumericArraySlice<T>, sliceEqualElements<T,T> >(
|
||||||
|
first, second, first_nm.get(), second_nm.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// hasAll over int: a short and a long list of wanted elements, each with and without a missing element.
TEST(HasAll, integer)
{
    const bool small_all_present = testHasAll<int>(4, 100, false, true);
    const bool small_one_missing = testHasAll<int>(4, 100, false, false);
    const bool large_all_present = testHasAll<int>(100, 4096, false, true);
    const bool large_one_missing = testHasAll<int>(100, 4096, false, false);

    ASSERT_TRUE(small_all_present);
    ASSERT_FALSE(small_one_missing);
    ASSERT_TRUE(large_all_present);
    ASSERT_FALSE(large_one_missing);
}
|
||||||
|
|
||||||
|
|
||||||
|
/// hasAll over int64_t: a short and a long list of wanted elements, each with and without a missing element.
TEST(HasAll, int64)
{
    const bool small_all_present = testHasAll<int64_t>(2, 100, false, true);
    const bool small_one_missing = testHasAll<int64_t>(2, 100, false, false);
    const bool large_all_present = testHasAll<int64_t>(100, 4096, false, true);
    const bool large_one_missing = testHasAll<int64_t>(100, 4096, false, false);

    ASSERT_TRUE(small_all_present);
    ASSERT_FALSE(small_one_missing);
    ASSERT_TRUE(large_all_present);
    ASSERT_FALSE(large_one_missing);
}
|
||||||
|
|
||||||
|
/// hasAll over int16_t: a short and a long list of wanted elements, each with and without a missing element.
TEST(HasAll, int16)
{
    const bool small_all_present = testHasAll<int16_t>(2, 100, false, true);
    const bool small_one_missing = testHasAll<int16_t>(2, 100, false, false);
    const bool large_all_present = testHasAll<int16_t>(100, 4096, false, true);
    const bool large_one_missing = testHasAll<int16_t>(100, 4096, false, false);

    ASSERT_TRUE(small_all_present);
    ASSERT_FALSE(small_one_missing);
    ASSERT_TRUE(large_all_present);
    ASSERT_FALSE(large_one_missing);
}
|
||||||
|
|
||||||
|
/// hasAll over int8_t: a short and a longer list of wanted elements, each with and without a missing element.
/// The array sizes stay below 127 so that every generated value fits into int8_t.
TEST(HasAll, int8)
{
    const bool small_all_present = testHasAll<int8_t>(2, 100, false, true);
    const bool small_one_missing = testHasAll<int8_t>(2, 100, false, false);
    const bool large_all_present = testHasAll<int8_t>(50, 125, false, true);
    const bool large_one_missing = testHasAll<int8_t>(50, 125, false, false);

    ASSERT_TRUE(small_all_present);
    ASSERT_FALSE(small_one_missing);
    ASSERT_TRUE(large_all_present);
    ASSERT_FALSE(large_one_missing);
}
|
||||||
|
|
||||||
|
/// With null maps enabled the wanted elements contain a NULL while the target array has none,
/// so hasAll must be false for every integer width.
TEST(HasAllSingleNullElement, all)
{
    const bool result_int32 = testHasAll<int>(4, 100, true, true);
    const bool result_int64 = testHasAll<int64_t>(4, 100, true, true);
    const bool result_int16 = testHasAll<int16_t>(4, 100, true, true);
    const bool result_int8 = testHasAll<int8_t>(4, 100, true, true);

    ASSERT_FALSE(result_int32);
    ASSERT_FALSE(result_int64);
    ASSERT_FALSE(result_int16);
    ASSERT_FALSE(result_int8);
}
|
53
tests/performance/has_all.xml
Normal file
53
tests/performance/has_all.xml
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
<test>
|
||||||
|
<!-- Declares the array_type placeholder, which appears in the table names and column types of this file.
     It takes the values Int8, Int16, Int32 and Int64.
     NOTE(review): presumably the performance-test runner expands every templated query once per value; confirm against the runner. -->
<substitutions>
|
||||||
|
<substitution>
|
||||||
|
<name>array_type</name>
|
||||||
|
<values>
|
||||||
|
<value>Int8</value>
|
||||||
|
<value>Int16</value>
|
||||||
|
<value>Int32</value>
|
||||||
|
<value>Int64</value>
|
||||||
|
</values>
|
||||||
|
</substitution>
|
||||||
|
</substitutions>
|
||||||
|
|
||||||
|
<!-- Table for the "small" case: two array columns of the substituted element type, `set` and `subset`,
     in a MergeTree table ordered by `set`. -->
<create_query>
|
||||||
|
CREATE TABLE test_table_small_{array_type}
|
||||||
|
(
|
||||||
|
`set` Array({array_type}),
|
||||||
|
`subset` Array ({array_type})
|
||||||
|
)
|
||||||
|
ENGINE = MergeTree ORDER BY set;
|
||||||
|
</create_query>
|
||||||
|
|
||||||
|
<!-- Table for the "medium" case: two array columns of the substituted element type, `set` and `subset`,
     in a MergeTree table ordered by `set`. -->
<create_query>
|
||||||
|
CREATE TABLE test_table_medium_{array_type}
|
||||||
|
(
|
||||||
|
`set` Array({array_type}),
|
||||||
|
`subset` Array ({array_type})
|
||||||
|
)
|
||||||
|
ENGINE = MergeTree ORDER BY set;
|
||||||
|
</create_query>
|
||||||
|
|
||||||
|
<!-- Table for the "large" case: two array columns of the substituted element type, `set` and `subset`,
     in a MergeTree table ordered by `set`. -->
<create_query>
|
||||||
|
CREATE TABLE test_table_large_{array_type}
|
||||||
|
(
|
||||||
|
`set` Array({array_type}),
|
||||||
|
`subset` Array ({array_type})
|
||||||
|
)
|
||||||
|
ENGINE = MergeTree ORDER BY set;
|
||||||
|
</create_query>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Fill queries. numbers(N) GROUP BY number % K yields K groups (one inserted row each) of N / K source rows:
       small:  5000 groups of 2000 rows
       medium: 50000 groups of 500 rows
       large:  500000 groups of 100 rows
     `set` and `subset` are sampled independently from rand64() values.
     NOTE(review): if groupArraySample(max_size) only caps the array length, as the ClickHouse documentation describes,
     the arrays cannot be longer than the rows per group. The "large" table would then hold the shortest arrays
     (100 elements) although 500000 are requested; confirm this is intended.
     NOTE(review): rand64() values are written into columns of the narrower substituted type; presumably they are
     converted on insert; confirm. -->
<fill_query>INSERT INTO test_table_small_{array_type} SELECT groupArraySample(5000)(rand64()) AS set, groupArraySample(500)(rand64()) AS subset FROM numbers(10000000) GROUP BY number % 5000;</fill_query>
|
||||||
|
<fill_query>INSERT INTO test_table_medium_{array_type} SELECT groupArraySample(50000)(rand64()) AS set, groupArraySample(5000)(rand64()) AS subset FROM numbers(25000000) GROUP BY number % 50000;</fill_query>
|
||||||
|
<fill_query>INSERT INTO test_table_large_{array_type} SELECT groupArraySample(500000)(rand64()) AS set, groupArraySample(500000)(rand64()) AS subset FROM numbers(50000000) GROUP BY number % 500000;</fill_query>
|
||||||
|
|
||||||
|
<!-- Measured queries: evaluate hasAll(set, subset) for every row of the small, medium and large table.
     FORMAT Null is used, presumably so that producing result output is not part of what is measured; confirm. -->
<query>SELECT hasAll(set, subset) FROM test_table_small_{array_type} FORMAT Null</query>
|
||||||
|
<query>SELECT hasAll(set, subset) FROM test_table_medium_{array_type} FORMAT Null</query>
|
||||||
|
<query>SELECT hasAll(set, subset) FROM test_table_large_{array_type} FORMAT Null</query>
|
||||||
|
|
||||||
|
<!-- Cleanup: drop the three tables created above. IF EXISTS lets the statement succeed when a table is absent. -->
<drop_query>DROP TABLE IF EXISTS test_table_small_{array_type}</drop_query>
|
||||||
|
<drop_query>DROP TABLE IF EXISTS test_table_medium_{array_type}</drop_query>
|
||||||
|
<drop_query>DROP TABLE IF EXISTS test_table_large_{array_type}</drop_query>
|
||||||
|
</test>
|
Loading…
Reference in New Issue
Block a user