Merge branch 'master' into always-backquote-column-names

This commit is contained in:
Alexey Milovidov 2019-03-25 23:26:59 +03:00
commit 7a8234580c
26 changed files with 2223 additions and 913 deletions

3
.gitmodules vendored
View File

@ -76,3 +76,6 @@
[submodule "contrib/brotli"]
path = contrib/brotli
url = https://github.com/google/brotli.git
[submodule "contrib/hyperscan"]
path = contrib/hyperscan
url = https://github.com/ClickHouse-Extras/hyperscan.git

View File

@ -318,6 +318,7 @@ include (cmake/find_pdqsort.cmake)
include (cmake/find_hdfs3.cmake) # uses protobuf
include (cmake/find_consistent-hashing.cmake)
include (cmake/find_base64.cmake)
include (cmake/find_hyperscan.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(metrohash)

View File

@ -0,0 +1,7 @@
# Configure the bundled hyperscan library (contrib/hyperscan).
# It is enabled only when HAVE_SSSE3 is set; otherwise USE_HYPERSCAN stays undefined
# and the consumers guarded by `if (USE_HYPERSCAN)` are skipped.
if (HAVE_SSSE3)
    set (HYPERSCAN_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/hyperscan/src)
    set (HYPERSCAN_LIBRARY hs)
    set (USE_HYPERSCAN 1)
    set (USE_INTERNAL_HYPERSCAN_LIBRARY 1)
    # Keep the whole status text inside a single quoted argument.
    message (STATUS "Using hyperscan: ${HYPERSCAN_INCLUDE_DIR} : ${HYPERSCAN_LIBRARY}")
endif()

View File

@ -304,3 +304,7 @@ endif ()
if (USE_BASE64)
add_subdirectory (base64-cmake)
endif()
if (USE_HYPERSCAN)
add_subdirectory (hyperscan)
endif()

2
contrib/boost vendored

@ -1 +1 @@
Subproject commit 6a96e8b59f76148eb8ad54a9d15259f8ce84c606
Subproject commit 471ea208abb92a5cba7d3a08a819bb728f27e95f

1
contrib/hyperscan vendored Submodule

@ -0,0 +1 @@
Subproject commit 05dab0efee80be405aad5f74721b692b6889b75e

View File

@ -57,7 +57,7 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
endif ()
if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra-semi-stmt -Wshadow-field -Wstring-plus-int")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra-semi-stmt -Wshadow-field -Wstring-plus-int -Wempty-init-stmt")
endif ()
if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)

View File

@ -437,10 +437,10 @@ public:
}
template <typename ResultType, typename AnsCallback>
void searchAll(
void searchAllPositions(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const AnsCallback & ansCallback,
const AnsCallback & ans_callback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
@ -461,7 +461,7 @@ public:
{
const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
if (ptr != haystack_end)
ans[from + fallback_needles[i]] = ansCallback(haystack, ptr);
ans[from + fallback_needles[i]] = ans_callback(haystack, ptr);
}
/// check if we have one non empty volnitsky searcher
@ -481,7 +481,7 @@ public:
{
if (fallback_searchers[ind].compare(res))
{
ans[from + ind] = ansCallback(haystack, res);
ans[from + ind] = ans_callback(haystack, res);
}
}
}
@ -513,6 +513,16 @@ public:
searchInternal(haystack_data, haystack_offsets, callback, ans);
}
/// For every haystack row, writes into `ans` the result of searchOneFirstPosition for that row.
/// `count_chars_callback(haystack_begin, match_begin)` converts a match pointer into the value to be stored.
template <typename ResultType, typename CountCharsCallback>
void searchFirstPosition(
    const ColumnString::Chars & haystack_data,
    const ColumnString::Offsets & haystack_offsets,
    const CountCharsCallback & count_chars_callback,
    ResultType & ans)
{
    /// Per-row searcher handed to the generic row iteration in searchInternal.
    auto search_one_row = [this, &count_chars_callback](const UInt8 * row_begin, const UInt8 * row_end) -> size_t
    {
        return this->searchOneFirstPosition(row_begin, row_end, count_chars_callback);
    };

    searchInternal(haystack_data, haystack_offsets, search_one_row, ans);
}
private:
/**
* This function is needed to initialize hash table
@ -582,7 +592,7 @@ private:
inline void searchInternal(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const OneSearcher & searchFallback,
const OneSearcher & search_fallback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
@ -593,7 +603,7 @@ private:
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
ans[j] = searchFallback(haystack, haystack_end);
ans[j] = search_fallback(haystack, haystack_end);
prev_offset = haystack_offsets[j];
}
}
@ -665,6 +675,41 @@ private:
return ans + 1;
}
/// Searches one haystack [haystack, haystack_end) for the needles and returns the minimum of
/// callback(haystack, match_begin) over the matches that were found, or 0 if nothing matched.
template <typename CountCharsCallback>
inline size_t searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & callback) const
{
const size_t fallback_size = fallback_needles.size();
/// Sentinel for "no match yet"; it is translated to 0 right before returning.
size_t ans = std::numeric_limits<size_t>::max();
/// Needles listed in fallback_needles are searched one by one with their own fallback searcher.
for (size_t i = 0; i < fallback_size; ++i)
if (auto pos = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end); pos != haystack_end)
ans = std::min(ans, callback(haystack, pos));
/// The hash-table scan is performed only if `step` was set, i.e. it differs from the size_t maximum.
if (step != std::numeric_limits<size_t>::max())
{
/// Walk the haystack with stride `step`, reading one n-gram at each visited position.
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
/// Probe the hash table starting at the n-gram's cell and moving to the next cell (with wrap-around)
/// until a cell with zero `off` is reached.
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
/// Skip candidates whose start would lie before the beginning of the haystack.
if (pos >= haystack + hash[cell_num].off - 1)
{
/// `off` is stored 1-based, so the candidate match starts (off - 1) bytes before `pos`.
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
/// The candidate counts only if the whole needle fits into the haystack and compares equal.
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
ans = std::min(ans, callback(haystack, res));
}
}
}
}
if (ans == std::numeric_limits<size_t>::max())
return 0;
return ans;
}
void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num)
{
size_t cell_num = ngram % VolnitskyTraits::hash_size;

View File

@ -64,3 +64,8 @@ if (USE_XXHASH)
target_link_libraries(clickhouse_functions PRIVATE ${XXHASH_LIBRARY})
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${XXHASH_INCLUDE_DIR})
endif()
if (USE_HYPERSCAN)
target_link_libraries (clickhouse_functions PRIVATE ${HYPERSCAN_LIBRARY})
target_include_directories (clickhouse_functions SYSTEM PRIVATE ${HYPERSCAN_INCLUDE_DIR})
endif ()

View File

@ -15,6 +15,10 @@
#include <algorithm>
#include <memory>
#ifdef __SSSE3__
# include <hs.h>
#endif
#if USE_RE2_ST
# include <re2_st/re2.h> // Y_IGNORE
#else
@ -312,7 +316,7 @@ struct PositionImpl
};
template <typename Impl>
struct MultiPositionImpl
struct MultiSearchAllPositionsImpl
{
using ResultType = UInt64;
@ -322,17 +326,31 @@ struct MultiPositionImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
auto resCallback = [](const UInt8 * start, const UInt8 * end) -> UInt64
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchAll(haystack_data, haystack_offsets, resCallback, res);
Impl::createMultiSearcherInBigHaystack(needles).searchAllPositions(haystack_data, haystack_offsets, res_callback, res);
}
};
/// Adapter that runs the multi-needle searcher produced by Impl over a column of haystacks
/// and stores one UInt8 per row in `res`.
template <typename Impl>
struct MultiSearchImpl
{
    using ResultType = UInt8;

    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<UInt8> & res)
    {
        auto multi_searcher = Impl::createMultiSearcherInBigHaystack(needles);
        multi_searcher.search(haystack_data, haystack_offsets, res);
    }
};
template <typename Impl>
struct MultiSearchFirstPositionImpl
{
using ResultType = UInt64;
@ -342,12 +360,16 @@ struct MultiSearchImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets, res);
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchFirstPosition(haystack_data, haystack_offsets, res_callback, res);
}
};
template <typename Impl>
struct FirstMatchImpl
struct MultiSearchFirstIndexImpl
{
using ResultType = UInt64;
@ -524,8 +546,8 @@ struct MatchImpl
res[i] = !revert;
else
{
const char * str_data = reinterpret_cast<const char *>(&data[i != 0 ? offsets[i - 1] : 0]);
size_t str_size = (i != 0 ? offsets[i] - offsets[i - 1] : offsets[0]) - 1;
const char * str_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
size_t str_size = offsets[i] - offsets[i - 1] - 1;
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
* so that it can match when `required_substring` occurs into the string several times,
@ -581,6 +603,78 @@ struct MatchImpl
};
/** Matches every haystack row against a set of regular expressions.
  *
  * Exactly one of the two mode flags must be set:
  *  - FindAny:      res[i] becomes 1 if at least one pattern matches row i, otherwise 0;
  *  - FindAnyIndex: res[i] becomes the 1-based number of a matching pattern, otherwise 0.
  *    If several patterns match, the value written last wins.
  *
  * With __SSSE3__ defined all patterns are compiled into one hyperscan database (see MultiRegexps::get)
  * and each row is scanned once. Otherwise every pattern is matched separately through MatchImpl.
  *
  * Throws Exception if hyperscan scratch space cannot be allocated or if a scan reports an error.
  */
template <typename Type, bool FindAny, bool FindAnyIndex>
struct MultiMatchAnyImpl
{
    static_assert(static_cast<int>(FindAny) + static_cast<int>(FindAnyIndex) == 1);
    using ResultType = Type;

    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<Type> & res)
    {
        (void)FindAny;
        (void)FindAnyIndex;
#ifdef __SSSE3__
        using ScratchPtr = std::unique_ptr<hs_scratch_t, DB::MultiRegexps::HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;

        const auto & hyperscan_regex = MultiRegexps::get<FindAnyIndex>(needles);

        hs_scratch_t * scratch = nullptr;
        hs_error_t err = hs_alloc_scratch(hyperscan_regex->get(), &scratch);
        /// Take ownership before checking the error so that nothing can leak on the throwing path.
        ScratchPtr smart_scratch(scratch);
        if (err != HS_SUCCESS)
            throw Exception("Could not allocate scratch space for hyperscan.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);

        /// Match handler: `context` points to the result cell of the row being scanned.
        /// Returning 0 tells hyperscan to continue scanning.
        auto on_match = []([[maybe_unused]] unsigned int id,
                           unsigned long long /* from */,
                           unsigned long long /* to */,
                           unsigned int /* flags */,
                           void * context) -> int
        {
            if constexpr (FindAnyIndex)
                *reinterpret_cast<Type *>(context) = id;
            else if constexpr (FindAny)
                *reinterpret_cast<Type *>(context) = 1;
            return 0;
        };

        const size_t haystack_offsets_size = haystack_offsets.size();
        size_t offset = 0;
        for (size_t i = 0; i < haystack_offsets_size; ++i)
        {
            res[i] = 0;
            /// The row length excludes the trailing byte that follows every string in the column.
            /// NOTE(review): hs_scan takes the length as `unsigned int`, so a longer row would be truncated -- confirm whether a guard is needed.
            err = hs_scan(
                hyperscan_regex->get(),
                reinterpret_cast<const char *>(haystack_data.data()) + offset,
                haystack_offsets[i] - offset - 1,
                0,
                smart_scratch.get(),
                on_match,
                &res[i]);
            /// The handler never asks to stop, so any status other than HS_SUCCESS is a real failure
            /// and must not be reported as "no match".
            if (err != HS_SUCCESS)
                throw Exception("Failed to scan with hyperscan.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
            offset = haystack_offsets[i];
        }
#else
        /// Fallback if not an intel processor
        PaddedPODArray<UInt8> accum(res.size());
        memset(res.data(), 0, res.size() * sizeof(res.front()));
        memset(accum.data(), 0, accum.size());
        for (size_t j = 0; j < needles.size(); ++j)
        {
            MatchImpl<false, false>::vector_constant(haystack_data, haystack_offsets, needles[j].toString(), accum);
            for (size_t i = 0; i < res.size(); ++i)
            {
                if constexpr (FindAny)
                    res[i] |= accum[i];
                else if (accum[i])
                    res[i] = j + 1;
            }
        }
#endif // __SSSE3__
    }
};
struct ExtractImpl
{
static void vector(
@ -1090,53 +1184,69 @@ struct NamePositionCaseInsensitiveUTF8
{
static constexpr auto name = "positionCaseInsensitiveUTF8";
};
struct NameMultiPosition
struct NameMultiSearchAllPositions
{
static constexpr auto name = "multiPosition";
static constexpr auto name = "multiSearchAllPositions";
};
struct NameMultiPositionUTF8
struct NameMultiSearchAllPositionsUTF8
{
static constexpr auto name = "multiPositionUTF8";
static constexpr auto name = "multiSearchAllPositionsUTF8";
};
struct NameMultiPositionCaseInsensitive
struct NameMultiSearchAllPositionsCaseInsensitive
{
static constexpr auto name = "multiPositionCaseInsensitive";
static constexpr auto name = "multiSearchAllPositionsCaseInsensitive";
};
struct NameMultiPositionCaseInsensitiveUTF8
struct NameMultiSearchAllPositionsCaseInsensitiveUTF8
{
static constexpr auto name = "multiPositionCaseInsensitiveUTF8";
static constexpr auto name = "multiSearchAllPositionsCaseInsensitiveUTF8";
};
struct NameMultiSearch
struct NameMultiSearchAny
{
static constexpr auto name = "multiSearch";
static constexpr auto name = "multiSearchAny";
};
struct NameMultiSearchUTF8
struct NameMultiSearchAnyUTF8
{
static constexpr auto name = "multiSearchUTF8";
static constexpr auto name = "multiSearchAnyUTF8";
};
struct NameMultiSearchCaseInsensitive
struct NameMultiSearchAnyCaseInsensitive
{
static constexpr auto name = "multiSearchCaseInsensitive";
static constexpr auto name = "multiSearchAnyCaseInsensitive";
};
struct NameMultiSearchCaseInsensitiveUTF8
struct NameMultiSearchAnyCaseInsensitiveUTF8
{
static constexpr auto name = "multiSearchCaseInsensitiveUTF8";
static constexpr auto name = "multiSearchAnyCaseInsensitiveUTF8";
};
struct NameFirstMatch
struct NameMultiSearchFirstIndex
{
static constexpr auto name = "firstMatch";
static constexpr auto name = "multiSearchFirstIndex";
};
struct NameFirstMatchUTF8
struct NameMultiSearchFirstIndexUTF8
{
static constexpr auto name = "firstMatchUTF8";
static constexpr auto name = "multiSearchFirstIndexUTF8";
};
struct NameFirstMatchCaseInsensitive
struct NameMultiSearchFirstIndexCaseInsensitive
{
static constexpr auto name = "firstMatchCaseInsensitive";
static constexpr auto name = "multiSearchFirstIndexCaseInsensitive";
};
struct NameFirstMatchCaseInsensitiveUTF8
struct NameMultiSearchFirstIndexCaseInsensitiveUTF8
{
static constexpr auto name = "firstMatchCaseInsensitiveUTF8";
static constexpr auto name = "multiSearchFirstIndexCaseInsensitiveUTF8";
};
/// Name tags for the multiSearchFirstPosition family; each `name` is the string under which the function is exposed.
struct NameMultiSearchFirstPosition
{
static constexpr auto name = "multiSearchFirstPosition";
};
struct NameMultiSearchFirstPositionUTF8
{
static constexpr auto name = "multiSearchFirstPositionUTF8";
};
struct NameMultiSearchFirstPositionCaseInsensitive
{
static constexpr auto name = "multiSearchFirstPositionCaseInsensitive";
};
struct NameMultiSearchFirstPositionCaseInsensitiveUTF8
{
static constexpr auto name = "multiSearchFirstPositionCaseInsensitiveUTF8";
};
struct NameMatch
{
@ -1150,6 +1260,14 @@ struct NameNotLike
{
static constexpr auto name = "notLike";
};
/// Name tags for the multi-pattern regular expression functions.
struct NameMultiMatchAny
{
static constexpr auto name = "multiMatchAny";
};
struct NameMultiMatchAnyIndex
{
static constexpr auto name = "multiMatchAnyIndex";
};
struct NameExtract
{
static constexpr auto name = "extract";
@ -1177,28 +1295,37 @@ using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<Posit
using FunctionPositionCaseInsensitiveUTF8
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
using FunctionMultiPosition = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveASCII>, NameMultiPosition>;
using FunctionMultiPositionUTF8 = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveUTF8>, NameMultiPositionUTF8>;
using FunctionMultiPositionCaseInsensitive
= FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveASCII>, NameMultiPositionCaseInsensitive>;
using FunctionMultiPositionCaseInsensitiveUTF8
= FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiPositionCaseInsensitiveUTF8>;
using FunctionMultiSearchAllPositions = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
using FunctionMultiSearchAllPositionsUTF8 = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
using FunctionMultiSearchAllPositionsCaseInsensitive
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearch>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchUTF8>;
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
using FunctionMultiSearchCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchCaseInsensitive>;
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
using FunctionMultiSearchCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchCaseInsensitiveUTF8>;
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
using FunctionFirstMatch = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveASCII>, NameFirstMatch>;
using FunctionFirstMatchUTF8 = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveUTF8>, NameFirstMatchUTF8>;
using FunctionFirstMatchCaseInsensitive
= FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveASCII>, NameFirstMatchCaseInsensitive>;
using FunctionFirstMatchCaseInsensitiveUTF8
= FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveUTF8>, NameFirstMatchCaseInsensitiveUTF8>;
using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
using FunctionMultiSearchFirstIndexUTF8 = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
using FunctionMultiSearchFirstIndexCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
using FunctionMultiSearchFirstPosition = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
using FunctionMultiSearchFirstPositionCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionMultiMatchAny = FunctionsMultiStringSearch<MultiMatchAnyImpl<UInt8, true, false>, NameMultiMatchAny, std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<MultiMatchAnyImpl<UInt64, false, true>, NameMultiMatchAnyIndex, std::numeric_limits<UInt32>::max()>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
@ -1220,26 +1347,34 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionPositionCaseInsensitive>();
factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiPosition>();
factory.registerFunction<FunctionMultiPositionUTF8>();
factory.registerFunction<FunctionMultiPositionCaseInsensitive>();
factory.registerFunction<FunctionMultiPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearchAllPositions>();
factory.registerFunction<FunctionMultiSearchAllPositionsUTF8>();
factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearch>();
factory.registerFunction<FunctionMultiSearchUTF8>();
factory.registerFunction<FunctionMultiSearchCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionFirstMatch>();
factory.registerFunction<FunctionFirstMatchUTF8>();
factory.registerFunction<FunctionFirstMatchCaseInsensitive>();
factory.registerFunction<FunctionFirstMatchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearchFirstIndex>();
factory.registerFunction<FunctionMultiSearchFirstIndexUTF8>();
factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearchFirstPosition>();
factory.registerFunction<FunctionMultiSearchFirstPositionUTF8>();
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionExtract>();
factory.registerFunction<FunctionMultiMatchAny>();
factory.registerFunction<FunctionMultiMatchAnyIndex>();
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
}

View File

@ -26,6 +26,8 @@ namespace DB
* notLike(haystack, pattern)
*
* match(haystack, pattern) - search by regular expression re2; Returns 0 or 1.
* multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 1 if any pattern_i matches, 0 otherwise.
* multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns the index of any matching pattern, or zero if none matches.
*
* Applies regexp re2 and pulls:
* - the first subpattern, if the regexp has a subpattern;
@ -39,20 +41,25 @@ namespace DB
* replaceRegexpOne(haystack, pattern, replacement) - replaces the pattern with the specified regexp, only the first occurrence.
* replaceRegexpAll(haystack, pattern, replacement) - replaces the pattern with the specified type, all occurrences.
*
* multiPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find first occurrences (positions) of all the const patterns inside haystack
* multiPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*
* multiSearch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1
* multiSearchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositions(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find first occurrences (positions) of all the const patterns inside haystack
* multiSearchAllPositionsUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositionsCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositionsCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first index of the matched string or zero if nothing was found
* firstMatchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first position of the haystack matched by strings or zero if nothing was found
* multiSearchFirstPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*
* multiSearchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1
* multiSearchAnyUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAnyCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAnyCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first index of the matched string or zero if nothing was found
* multiSearchFirstIndexUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstIndexCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstIndexCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*/
namespace ErrorCodes
@ -269,9 +276,13 @@ public:
}
};
template <typename Impl, typename Name>
/// The limit on the number of arguments comes from the Volnitsky searcher -- it is performance-critical to store the pattern number in only one byte.
/// However, some other searchers also use this class, for example, multiMatchAny -- hyperscan does not have such a restriction.
template <typename Impl, typename Name, size_t LimitArgs = std::numeric_limits<UInt8>::max()>
class FunctionsMultiStringSearch : public IFunction
{
static_assert(LimitArgs > 0);
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringSearch>(); }
@ -282,10 +293,10 @@ public:
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() + 1 >= std::numeric_limits<UInt8>::max())
if (arguments.size() + 1 >= LimitArgs)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
+ ", should be at most 255.",
+ ", should be at most " + std::to_string(LimitArgs) + ".",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isString(arguments[0]))
@ -333,6 +344,7 @@ public:
vec_res.resize(column_haystack_size);
/// TODO support constant_constant version
if (col_haystack_vector)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else

View File

@ -1,10 +1,18 @@
#pragma once
#include <Common/OptimizedRegularExpression.h>
#include <Common/ObjectPool.h>
#include <Common/ProfileEvents.h>
#include <Functions/likePatternToRegexp.h>
#include <Common/ObjectPool.h>
#include <Common/OptimizedRegularExpression.h>
#include <Common/ProfileEvents.h>
#include <common/StringRef.h>
#include <memory>
#include <string>
#include <vector>
#ifdef __SSSE3__
# include <hs.h>
#endif
namespace ProfileEvents
{
@ -14,6 +22,11 @@ namespace ProfileEvents
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int LOGICAL_ERROR;
}
namespace Regexps
{
@ -21,10 +34,16 @@ namespace Regexps
using Pool = ObjectPoolMap<Regexp, String>;
template <bool like>
inline Regexp createRegexp(const std::string & pattern, int flags) { return {pattern, flags}; }
inline Regexp createRegexp(const std::string & pattern, int flags)
{
return {pattern, flags};
}
template <>
inline Regexp createRegexp<true>(const std::string & pattern, int flags) { return {likePatternToRegexp(pattern), flags}; }
inline Regexp createRegexp<true>(const std::string & pattern, int flags)
{
return {likePatternToRegexp(pattern), flags};
}
template <bool like, bool no_capture>
inline Pool::Pointer get(const std::string & pattern)
@ -44,4 +63,82 @@ namespace Regexps
}
}
#ifdef __SSSE3__
/// Compilation and caching of multi-pattern hyperscan databases.
namespace MultiRegexps
{
    /// Stateless deleter that calls a C-style free function, for use with std::unique_ptr.
    template <typename Deleter, Deleter deleter>
    struct HyperscanDeleter
    {
        template <typename T>
        void operator()(T * ptr) const
        {
            deleter(ptr);
        }
    };

    using Regexps = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;

    using Pool = ObjectPoolMap<Regexps, std::vector<String>>;

    /** Returns a pooled hyperscan database compiled from `patterns`; the pool key is the list of pattern strings.
      *
      * If FindAnyIndex is true, pattern number i (0-based) is compiled with id i + 1, so that match handlers
      * receive a 1-based pattern number. Otherwise no ids are passed to hyperscan.
      *
      * Throws Exception with LOGICAL_ERROR if hyperscan fails to compile the patterns.
      */
    template <bool FindAnyIndex>
    inline Pool::Pointer get(const std::vector<StringRef> & patterns)
    {
        /// C++11 has thread-safe function-local statics on most modern compilers.
        static Pool known_regexps; /// Different variables for different pattern parameters.

        std::vector<String> str_patterns;
        str_patterns.reserve(patterns.size());
        for (const StringRef & ref : patterns)
            str_patterns.push_back(ref.toString());

        return known_regexps.get(str_patterns, [&str_patterns]
        {
            std::vector<const char *> ptrns;
            std::vector<unsigned int> flags;
            ptrns.reserve(str_patterns.size());
            flags.reserve(str_patterns.size());
            /// hyperscan takes null-terminated C strings; the pointers stay valid while str_patterns is alive.
            for (const String & pattern : str_patterns)
            {
                ptrns.push_back(pattern.c_str());
                flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH);
            }

            hs_database_t * db = nullptr;
            hs_compile_error_t * compile_error = nullptr;

            std::unique_ptr<unsigned int[]> ids;
            if constexpr (FindAnyIndex)
            {
                ids = std::make_unique<unsigned int[]>(ptrns.size());
                for (size_t i = 0; i < ptrns.size(); ++i)
                    ids[i] = i + 1;
            }

            hs_error_t err
                = hs_compile_multi(ptrns.data(), flags.data(), ids.get(), ptrns.size(), HS_MODE_BLOCK, nullptr, &db, &compile_error);

            if (err != HS_SUCCESS)
            {
                /// The error object is owned by us and must be released with hs_free_compile_error.
                std::unique_ptr<
                    hs_compile_error_t,
                    HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>> error(compile_error);

                /// A negative `expression` means the error is not attributed to a particular pattern.
                if (error->expression < 0)
                    throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR);
                else
                    throw Exception(
                        "Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message) + "'",
                        ErrorCodes::LOGICAL_ERROR);
            }

            ProfileEvents::increment(ProfileEvents::RegexpCreated);

            /// Ownership of the returned object is handed to the pool.
            return new Regexps{db};
        });
    }
}
#endif // __SSSE3__
}

View File

@ -404,6 +404,8 @@ def main(args):
def find_binary(name):
if os.path.exists(name) and os.access(name, os.X_OK):
return True
paths = os.environ.get("PATH").split(':')
for path in paths:
if os.access(os.path.join(path, name), os.X_OK):
@ -416,7 +418,7 @@ if __name__ == '__main__':
parser=ArgumentParser(description='ClickHouse functional tests')
parser.add_argument('-q', '--queries', help='Path to queries dir')
parser.add_argument('--tmp', help='Path to tmp dir')
parser.add_argument('-b', '--binary', default='clickhouse', help='Main clickhouse binary')
parser.add_argument('-b', '--binary', default='clickhouse', help='Path to clickhouse binary or name of binary in PATH')
parser.add_argument('-c', '--client', help='Client program')
parser.add_argument('--extract_from_config', help='extract-from-config program')
parser.add_argument('--configclient', help='Client config (if you use not default ports)')

View File

@ -9,18 +9,18 @@ ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. && pwd)
DATA_DIR=${DATA_DIR:=`mktemp -d /tmp/clickhouse.test..XXXXX`}
DATA_DIR_PATTERN=${DATA_DIR_PATTERN:=/tmp/clickhouse} # path from config file, will be replaced to temporary
LOG_DIR=${LOG_DIR:=$DATA_DIR/log}
export CLICKHOUSE_BINARY=${CLICKHOUSE_BINARY:="clickhouse"}
( [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY}-server" ] || [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY}" ] ) && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR} # Build without separate build dir
export CLICKHOUSE_BINARY_NAME=${CLICKHOUSE_BINARY_NAME:="clickhouse"}
( [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}-server" ] || [ -x "$ROOT_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}" ] ) && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR} # Build without separate build dir
[ -d "$ROOT_DIR/build${BUILD_TYPE}" ] && BUILD_DIR=${BUILD_DIR:=$ROOT_DIR/build${BUILD_TYPE}}
BUILD_DIR=${BUILD_DIR:=$ROOT_DIR}
# Clear BIN_DIR only when both the -server and -client binaries are executable at the given path.
# Each path is quoted on both sides so that the two tests are evaluated as separate `[ ... ]` commands.
[ -x "${CLICKHOUSE_BINARY}-server" ] && [ -x "${CLICKHOUSE_BINARY}-client" ] && BIN_DIR= # Allow run in /usr/bin
( [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY}" ] || [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY}-server" ] ) && BIN_DIR=${BIN_DIR:=$BUILD_DIR/dbms/programs/}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}-server" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY}-server}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY} server}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}-client" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY}-client}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY} client}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}-extract-from-config" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY}-extract-from-config}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY} extract-from-config}
# Clear BIN_DIR only when both the -server and -client binaries are executable at the given path.
# Each path is quoted on both sides so that the two tests are evaluated as separate `[ ... ]` commands.
[ -x "${CLICKHOUSE_BINARY_NAME}-server" ] && [ -x "${CLICKHOUSE_BINARY_NAME}-client" ] && BIN_DIR= # Allow run in /usr/bin
( [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}" ] || [ -x "$BUILD_DIR/dbms/programs/${CLICKHOUSE_BINARY_NAME}-server" ] ) && BIN_DIR=${BIN_DIR:=$BUILD_DIR/dbms/programs/}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-server" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-server}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_SERVER=${CLICKHOUSE_SERVER:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} server}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-client" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-client}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} client}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-extract-from-config" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME}-extract-from-config}
[ -x "$BIN_DIR/${CLICKHOUSE_BINARY_NAME}" ] && CLICKHOUSE_EXTRACT=${CLICKHOUSE_EXTRACT:=$BIN_DIR/${CLICKHOUSE_BINARY_NAME} extract-from-config}
[ -f "$CUR_DIR/server-test.xml" ] && CONFIG_DIR=${CONFIG_DIR=$CUR_DIR}/
CONFIG_CLIENT_DIR=${CONFIG_CLIENT_DIR=$CONFIG_DIR}
@ -131,7 +131,7 @@ else
TEST_DICT=${TEST_DICT=1}
CLICKHOUSE_CLIENT_QUERY="${CLICKHOUSE_CLIENT} --config ${CLICKHOUSE_CONFIG_CLIENT} --port $CLICKHOUSE_PORT_TCP -m -n -q"
$CLICKHOUSE_CLIENT_QUERY 'SELECT * from system.build_options; SELECT * FROM system.clusters;'
CLICKHOUSE_TEST="env ${TEST_DIR}clickhouse-test --binary ${BIN_DIR}${CLICKHOUSE_BINARY} --configclient $CLICKHOUSE_CONFIG_CLIENT --configserver $CLICKHOUSE_CONFIG --tmp $DATA_DIR/tmp --queries $QUERIES_DIR $TEST_OPT0 $TEST_OPT"
CLICKHOUSE_TEST="env ${TEST_DIR}clickhouse-test --binary ${BIN_DIR}${CLICKHOUSE_BINARY_NAME} --configclient $CLICKHOUSE_CONFIG_CLIENT --configserver $CLICKHOUSE_CONFIG --tmp $DATA_DIR/tmp --queries $QUERIES_DIR $TEST_OPT0 $TEST_OPT"
CLICKHOUSE_PERFORMANCE_TEST="${BIN_DIR}clickhouse-performance-test --port $CLICKHOUSE_PORT_TCP --recursive $CUR_DIR/performance --skip-tags=long"
if [ "${TEST_RUN_STRESS}" ]; then
# Running test in parallel will fail some results (tests can create/fill/drop same tables)

View File

@ -23,22 +23,36 @@
</stop_conditions>
<query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiPosition(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiSearchAllPositions(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiMatchAny(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'yahoo')), sum(match(URL, 'pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchAny(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatchAny(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|yahoo|pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchAny(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatchAny(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'facebook')), sum(match(URL, 'wikipedia')), sum(match(URL, 'reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchAny(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatchAny(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|facebook|wikipedia|reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(firstMatch(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchFirstIndex(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['about/address', 'for_woman', '^https?://lm-company.ruy/$', 'ultimateguitar.com'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE match(URL, 'about/address|for_woman|^https?://lm-company.ruy/$|ultimateguitar.com')]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE match(URL, 'chelyabinsk.74.ru|doctor.74.ru|transport.74.ru|m.74.ru|//74.ru/|chel.74.ru|afisha.74.ru|diplom.74.ru|chelfin.ru|//chel.ru|chelyabinsk.ru|cheldoctor.ru|//mychel.ru|cheldiplom.ru|74.ru/video|market|poll|mail|conference|consult|contest|tags|feedback|pages|text')]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['chelyabinsk.74.ru', 'doctor.74.ru', 'transport.74.ru', 'm.74.ru', '//74.ru/', 'chel.74.ru', 'afisha.74.ru', 'diplom.74.ru', 'chelfin.ru', '//chel.ru', 'chelyabinsk.ru', 'cheldoctor.ru', '//mychel.ru', 'cheldiplom.ru', '74.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['chelyabinsk\\.74\\.ru', 'doctor\\.74\\.ru', 'transport\\.74\\.ru', 'm\\.74\\.ru', '//74\\.ru/', 'chel\\.74\\.ru', 'afisha\\.74\\.ru', 'diplom\\.74\\.ru', 'chelfin\\.ru', '//chel\\.ru', 'chelyabinsk\\.ru', 'cheldoctor\\.ru', '//mychel\\.ru', 'cheldiplom\\.ru', '74\\.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiSearchAny(URL, ['chelyabinsk.74.ru', 'doctor.74.ru', 'transport.74.ru', 'm.74.ru', '//74.ru/', 'chel.74.ru', 'afisha.74.ru', 'diplom.74.ru', 'chelfin.ru', '//chel.ru', 'chelyabinsk.ru', 'cheldoctor.ru', '//mychel.ru', 'cheldiplom.ru', '74.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<main_metric>
<min_time/>

View File

@ -23192,3 +23192,243 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -16,7 +16,7 @@ $CLICKHOUSE_CLIENT -q "select getColumnStructure('abc');" 2>&1 | grep "Maybe you
$CLICKHOUSE_CLIENT -q "select gutColumnStructure('abc');" 2>&1 | grep "Maybe you meant: \['dumpColumnStructure'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select gupColumnStructure('abc');" 2>&1 | grep "Maybe you meant: \['dumpColumnStructure'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select provideColumnStructure('abc');" 2>&1 | grep "Maybe you meant: \['dumpColumnStructure'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiposicionutf7('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionUTF8','multiPosition'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiposicionutf7casesensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionCaseInsensitive'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiposicionutf7sensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionCaseInsensitive'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiPosicionSensitiveUTF8('abc');" 2>&1 | grep "Maybe you meant: \['multiPositionCaseInsensitiveUTF8'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multisearchallposicionutf7('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAllPositionsUTF8','multiSearchAllPositions'\]" &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multisearchallposicionutf7casesensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAllPositionsCaseInsensitive','multiSearchAllPositionsCaseInsensitiveUTF8'\]." &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiSearchAllposicionutf7sensitive('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAllPositionsCaseInsensitive','multiSearchAnyCaseInsensitive'\]." &>/dev/null;
$CLICKHOUSE_CLIENT -q "select multiSearchAllPosicionSensitiveUTF8('abc');" 2>&1 | grep "Maybe you meant: \['multiSearchAnyCaseInsensitiveUTF8','multiSearchAllPositionsCaseInsensitiveUTF8'\]." &>/dev/null;

View File

@ -0,0 +1,601 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,81 @@
select 0 = multiMatchAny(materialize('mpnsguhwsitzvuleiwebwjfitmsg'), ['wbirxqoabpblrnvvmjizj', 'cfcxhuvrexyzyjsh', 'oldhtubemyuqlqbwvwwkwin', 'bumoozxdkjglzu', 'intxlfohlxmajjomw', 'dxkeghohv', 'arsvmwwkjeopnlwnan', 'ouugllgowpqtaxslcopkytbfhifaxbgt', 'hkedmjlbcrzvryaopjqdjjc', 'tbqkljywstuahzh', 'o', 'wowoclosyfcuwotmvjygzuzhrery', 'vpefjiffkhlggntcu', 'ytdixvasrorhripzfhjdmlhqksmctyycwp']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('qjjzqexjpgkglgxpzrbqbnskq'), ['vaiatcjacmlffdzsejpdareqzy', 'xspcfzdufkmecud', 'bcvtbuqtctq', 'nkcopwbfytgemkqcfnnno', 'dylxnzuyhq', 'tno', 'scukuhufly', 'cdyquzuqlptv', 'ohluyfeksyxepezdhqmtfmgkvzsyph', 'ualzwtahvqvtijwp', 'jg', 'gwbawqlngzcknzgtmlj', 'qimvjcgbkkp', 'eaedbcgyrdvv', 'qcwrncjoewwedyyewcdkh', 'uqcvhngoqngmitjfxpznqomertqnqcveoqk', 'ydrgjiankgygpm', 'axepgap']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('fdkmtqmxnegwvnjhghjq'), ['vynkybvdmhgeezybbdqfrukibisj', 'knazzamgjjpavwhvdkwigykh', 'peumnifrmdhhmrqqnemw', 'lmsnyvqoisinlaqobxojlwfbi', 'oqwfzs', 'dymudxxeodwjpgbibnkvr', 'vomtfsnizkplgzktqyoiw', 'yoyfuhlpgrzds', 'cefao', 'gi', 'srpgxfjwl', 'etsjusdeiwbfe', 'ikvtzdopxo', 'ljfkavrau', 'soqdhxtenfrkmeic', 'ktprjwfcelzbup', 'pcvuoddqwsaurcqdtjfnczekwni', 'agkqkqxkfbkfgyqliahsljim']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('khljxzxlpcrxpkrfybbfk'), ['', 'lpc', 'rxpkrfybb', 'crxp', '', 'pkr', 'jxzxlpcrxpkrf', '', 'xzxlpcr', 'xpk', 'fyb', 'xzxlpcrxpkrfybbfk', 'k', 'lpcrxp', 'ljxzxlpcr', 'r', 'pkr', 'fk']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('rbrizgjbigvzfnpgmpkqxoqxvdj'), ['ee', 'cohqnb', 'msol', 'yhlujcvhklnhuomy', 'ietn', 'vgmnlkcsybtokrepzrm', 'wspiryefojxysgrzsxyrluykxfnnbzdstcel', 'mxisnsivndbefqxwznimwgazuulupbaihavg', 'vpzdjvqqeizascxmzdhuq', 'pgvncohlxcqjhfkm', 'mbaypcnfapltsegquurahlsruqvipfhrhq', 'ioxjbcyyqujfveujfhnfdfokfcrlsincjbdt', 'cnvlujyowompdrqjwjx', 'wobwed', 'kdfhaoxiuifotmptcmdbk', 'leoamsnorcvtlmokdomkzuo', 'jjw', 'ogugysetxuqmvggneosbsfbonszepsatq']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('uymwxzyjbfegbhgswiqhinf'), ['lizxzbzlwljkr', 'ukxygktlpzuyijcqeqktxenlaqi', 'onperabgbdiafsxwbvpjtyt', 'xfqgoqvhqph', 'aflmcwabtwgmajmmqelxwkaolyyhmdlc', 'yfz', 'meffuiaicvwed', 'hhzvgmifzamgftkifaeowayjrnnzw', 'nwewybtajv', 'ectiye', 'epjeiljegmqqjncubj', 'zsjgftqjrn', 'pssng', 'raqoarfhdoeujulvqmdo']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('omgghgnzjmecpzqmtcvw'), ['fjhlzbszodmzavzg', 'gfofrnwrxprkfiokv', 'jmjiiqpgznlmyrxwewzqzbe', 'pkyrsqkltlmxr', 'crqgkgqkkyujcyoc', 'endagbcxwqhueczuasykmajfsvtcmh', 'xytmxtrnkdysuwltqomehddp', 'etmdxyyfotfyifwvbykghijvwv', 'mwqtgrncyhkfhjdg', 'iuvymofrqpp', 'pgllsdanlhzqhkstwsmzzftp', 'disjylcceufxtjdvhy']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('mznihnmshftvnmmhnrulizzpslq'), ['nrul', 'mshftvnmmhnr', 'z', 'mhnrulizzps', 'hftvnmmhnrul', 'ihnmshftvnmmhnrulizzp', 'izz', '', 'uli', 'nihnmshftvnmmhnru', 'hnrulizzp', 'nrulizz']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('ruqmqrsxrbftvruvahonradau'), ['uqmqrsxrbft', 'ftv', 'tvruvahonrad', 'mqrsxrbftvruvahon', 'rbftvruvah', 'qrsxrbftvru', 'o', 'ahonradau', 'a', 'ft', '', 'u', 'rsxrbftvruvahonradau', 'ruvahon', 'bftvruvahonradau', 'qrsxrbftvru', 't', 'vahonrada', 'vruvahonradau', 'onra']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('gpsevxtcoeexrltyzduyidmtzxf'), ['exrltyzduyid', 'vxtcoeexrltyz', 'xr', 'ltyzduyidmt', 'yzduy', 'exr', 'coeexrltyzduy', 'coeexrltyzduy', 'rlty', 'rltyzduyidm', 'exrltyz', 'xtcoeexrlty', 'vxtcoeexrltyzduyidm', '', 'coeexrl', 'sevxtcoeexrltyzdu', 'dmt', '']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('dyhycfhzyewaikgursyxfkuv'), ['sktnofpugrmyxmbizzrivmhn', 'fhlgadpoqcvktbfzncxbllvwutdawmw', 'eewzjpcgzrqmltbgmhafwlwqb', 'tpogbkyj', 'rtllntxjgkzs', 'mirbvsqexscnzglogigbujgdwjvcv', 'iktwpgjsakemewmahgqza', 'xgfvzkvqgiuoihjjnxwwpznxhz', 'nxaumpaknreklbwynvxdsmatjekdlxvklh', 'zadzwqhgfxqllihuudozxeixyokhny', 'tdqpgfpzexlkslodps', 'slztannufxaabqfcjyfquafgfhfb', 'xvjldhfuwurvkb', 'aecv', 'uycfsughpikqsbcmwvqygdyexkcykhbnau', 'jr']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('vbcsettndwuntnruiyclvvwoo'), ['dwuntnru', '', 'ttndwuntnruiyclvv', 'ntnr', 'nruiyclvvw', 'wo', '', 'bcsettndwuntnruiycl', 'yc', 'untnruiyclvvw', 'csettndwuntnr', 'ntnruiyclvvwo']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('pqqnugshlczcuxhpjxjbcnro'), ['dpeedqy', 'rtsc', 'jdgla', 'qkgudqjiyzvlvsj', 'xmfxawhijgxxtydbd', 'ebgzazqthb', 'wyrjhvhwzhmpybnylirrn', 'iviqbyuclayqketooztwegtkgwnsezfl', 'bhvidy', 'hijctxxweboq', 't', 'osnzfbziidteiaifgaanm']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('loqchlxspwuvvccucskuytr'), ['', 'k', 'qchlxspwu', 'u', 'hlxspwuvv', 'wuvvccucsku', 'vcc', 'uyt', 'uvv', 'spwu', 'ytr', 'wuvvccucs', 'xspwuv', 'lxspwuvvccuc', 'spwuvvccu', 'oqchlxspwuvvccucskuy']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('pjjyzupzwllshlnatiujmwvaofr'), ['lnatiujmwvao', '', 'zupzwllsh', 'nati', 'wllshl', 'hlnatiujmwv', 'mwvao', 'shlnat', 'ati', 'wllshlnatiujmwvao', 'wllshlnatiujmwvaofr', 'nat']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('iketunkleyaqaxdlocci'), ['nkleyaqaxd', 'etunkleyaq', 'yaqaxdlocci', 'tunkleyaq', 'eyaqaxdlocc', 'leyaq', 'nkleyaqaxdl', 'tunkleya', 'kleyaqa', 'etunkleya', 'leyaqa', 'dlo', 'yaqa', 'leyaqaxd', 'etunkleyaq', '']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('drqianqtangmgbdwruvblkqd'), ['wusajejyucamkyl', 'wsgibljugzrpkniliy', 'lhwqqiuafwffyersqjgjvvvfurx', 'jfokpzzxfdonelorqu', 'ccwkpcgac', 'jmyulqpndkmzbfztobwtm', 'rwrgfkccgxht', 'ggldjecrgbngkonphtcxrkcviujihidjx', 'spwweavbiokizv', 'lv', 'krb', 'vstnhvkbwlqbconaxgbfobqky', 'pvxwdc', 'thrl', 'ahsblffdveamceonqwrbeyxzccmux', 'yozji', 'oejtaxwmeovtqtz', 'zsnzznvqpxdvdxhznxrjn', 'hse', 'kcmkrccxmljzizracxwmpoaggywhdfpxkq']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('yasnpckniistxcejowfijjsvkdajz'), ['slkpxhtsmrtvtm', 'crsbq', 'rdeshtxbfrlfwpsqojassxmvlfbzefldavmgme', 'ipetilcbpsfroefkjirquciwtxhrimbmwnlyv', 'knjpwkmdwbvdbapuyqbtsw', 'horueidziztxovqhsicnklmharuxhtgrsr', 'ofohrgpz', 'oneqnwyevbaqsonrcpmxcynflojmsnix', 'shg', 'nglqzczevgevwawdfperpeytuodjlf']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('ueptpscfgxhplwsueckkxs'), ['ohhygchclbpcdwmftperprn', 'dvpjdqmqckekndvcerqrpkxen', 'lohhvarnmyi', 'zppd', 'qmqxgfewitsunbuhffozcpjtc', 'hsjbioisycsrawktqssjovkmltxodjgv', 'dbzuunwbkrtosyvctdujqtvaawfnvuq', 'gupbvpqthqxae', 'abjdmijaaiasnccgxttmqdsz', 'uccyumqoyqe', 'kxxliepyzlc', 'wbqcqtbyyjbqcgdbpkmzugksmcxhvr', 'piedxm', 'uncpphzoif', 'exkdankwck', 'qeitzozdrqopsergzr', 'hesgrhaftgesnzflrrtjdobxhbepjoas', 'wfpexx']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('ldrzgttlqaphekkkdukgngl'), ['gttlqaphekkkdukgn', 'ekkkd', 'gttlqaphe', 'qaphek', 'h', 'kdu', 'he', 'phek', '', 'drzgttlqaphekkkd']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('ololo'), ['ololo', 'ololo', 'ololo']);
SELECT 1 = multiMatchAny(materialize('khljxzxlpcrxpkrfybbfk'), ['k']);
select 1 = multiMatchAny(materialize(''), ['']);
select 0 = multiMatchAny(materialize(''), ['some string']);
select 1 = multiMatchAny(materialize('abc'), ['']);
select 1 = multiMatchAny(materialize('abc'), ['']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['defgh']);
select 0 = multiMatchAny(materialize('abc'), ['defg']);
select 0 = multiMatchAny(materialize('abc'), ['def']);
select 0 = multiMatchAny(materialize('abc'), ['de']);
select 0 = multiMatchAny(materialize('abc'), ['d']);
select 1 = multiMatchAny(materialize('abc'), ['b']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['bc']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcde']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcdef']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcdefg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['bcdefgh']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcdefg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcdef']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcde']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['abcd']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['abc']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['ab']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['a']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abcd'), ['c']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abcd'), ['cd']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cde']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cdef']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cdefg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcd'), ['cdefgh']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['defgh']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['defg']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['def']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['de']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abc'), ['d']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['...']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('a\nbc'), ['a?bc']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('a\nbc'), ['a.bc']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('a\0bc'), ['a?bc']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('a\0bc'), ['a.bc']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abcdef'), ['a.....']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('abcdef'), ['a......']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abcdef'), ['a......', 'a.....']) from system.numbers limit 10;
select 0 = multiMatchAny(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']) from system.numbers limit 10;
select 1 = multiMatchAny(materialize('abc'), ['a\0d']) from system.numbers limit 10;
select 1 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google', 'yandex1']) from system.numbers limit 10;;
select 2 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google1', 'yandex']) from system.numbers limit 10;;
select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*']) from system.numbers limit 10;;
select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10;;
SELECT multiMatchAny(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']);

View File

@ -10,6 +10,7 @@
| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) |
| FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) |
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |

View File

@ -15,23 +15,29 @@ The same as `position`, but the position is returned in Unicode code points. Wor
For a case-insensitive search, use the function `positionCaseInsensitiveUTF8`.
## multiPosition(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n])
The same as `position`, but returns an `Array` of the positions of all `needle_i`.
For a case-insensitive search or/and in UTF-8 format use functions `multiPositionCaseInsensitive, multiPositionUTF8, multiPositionCaseInsensitiveUTF8`.
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchAllPositionsCaseInsensitive, multiSearchAllPositionsUTF8, multiSearchAllPositionsCaseInsensitiveUTF8`.
## firstMatch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n])
Returns the index `i` (starting from 1) of the first found `needle_i` in the string `haystack` and 0 otherwise.
The same as `position`, but returns the leftmost offset in the string `haystack` at which any of the needles is found.
For a case-insensitive search or/and in UTF-8 format use functions `firstMatchCaseInsensitive, firstMatchUTF8, firstMatchCaseInsensitiveUTF8`.
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`.
## multiSearch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n])
Returns the index `i` (starting from 1) of the leftmost found `needle_i` in the string `haystack` and 0 otherwise.
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`.
## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n])
Returns 1 if at least one string `needle_i` is found in the string `haystack`, and 0 otherwise.
For a case-insensitive search or/and in UTF-8 format use functions `multiSearchCaseInsensitive, multiSearchUTF8, multiSearchCaseInsensitiveUTF8`.
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
## match(haystack, pattern)
@ -44,6 +50,16 @@ Note that the backslash symbol (`\`) is used for escaping in the regular express
The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes.
For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster.
## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n])
The same as `match`, but returns 0 if none of the regular expressions match and 1 if any of the patterns matches. It uses the [hyperscan](https://github.com/intel/hyperscan) library. To search for plain substrings in a string, it is better to use `multiSearchAny`, since it works much faster.
Note: this function is in experimental mode because of some [issues](https://github.com/intel/hyperscan/issues/141).
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
The same as `multiMatchAny`, but returns the index of any pattern that matches the haystack.
## extract(haystack, pattern)
Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern.

View File

@ -10,6 +10,7 @@
| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) |
| FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) |
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |

View File

@ -13,20 +13,26 @@
Для поиска без учета регистра используйте функцию `positionCaseInsensitiveUTF8`.
## multiPosition(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n])
Так же, как и `position`, только возвращает `Array` первых вхождений.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiPositionCaseInsensitive, multiPositionUTF8, multiPositionCaseInsensitiveUTF8`.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAllPositionsCaseInsensitive, multiSearchAllPositionsUTF8, multiSearchAllPositionsCaseInsensitiveUTF8`.
## firstMatch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n])
Так же, как и `position`, только возвращает оффсет первого вхождения любого из needles.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`.
## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n])
Возвращает индекс `i` (нумерация с единицы) первой найденной строки `needle_i` в строке `haystack` и 0 иначе.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `firstMatchCaseInsensitive, firstMatchUTF8, firstMatchCaseInsensitiveUTF8`.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`.
## multiSearch(haystack, [needle_1, needle_2, ..., needle_n])
## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n])
Возвращает 1, если хотя бы одна подстрока `needle_i` нашлась в строке `haystack` и 0 иначе.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchCaseInsensitive, multiSearchUTF8, multiSearchCaseInsensitiveUTF8`.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
## match(haystack, pattern)
Проверка строки на соответствие регулярному выражению pattern. Регулярное выражение **re2**. Синтаксис регулярных выражений **re2** является более ограниченным по сравнению с регулярными выражениями **Perl** ([подробнее](https://github.com/google/re2/wiki/Syntax)).
@ -37,6 +43,16 @@
Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты.
Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее.
## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n])
То же, что и `match`, но возвращает 0, если ни одно регулярное выражение не подошло, и 1, если подошло хотя бы одно. Для сопоставления с регулярными выражениями используется библиотека [hyperscan](https://github.com/intel/hyperscan). Для поиска нескольких подстрок в строке лучше используйте `multiSearchAny`, так как она работает существенно быстрее.
Примечание: эта функция находится пока в экспериментальном режиме из-за некоторых [проблем](https://github.com/intel/hyperscan/issues/141).
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
То же, что `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
## extract(haystack, pattern)
Извлечение фрагмента строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Если регулярное выражение не содержит subpattern-ов, то вынимается фрагмент, который подпадает под всё регулярное выражение. Иначе вынимается фрагмент, который подпадает под первый subpattern.

View File

@ -1,6 +1,6 @@
# MaterializedView
# 物化视图
Used for implementing materialized views (for more information, see [CREATE TABLE](../../query_language/create.md)). For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses this engine.
用于实现物化视图(更多信息请参阅 [CREATE TABLE](../../query_language/create.md) )。它需要使用一个不同的引擎来存储数据,这个引擎要在创建物化视图时指定。当从表中读取时,它就会使用该引擎。
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/materializedview/) <!--hide-->
[来源文章](https://clickhouse.yandex/docs/en/operations/table_engines/materializedview/) <!--hide-->

View File

@ -1,30 +1,30 @@
# Merge
The `Merge` engine (not to be confused with `MergeTree`) does not store data itself, but allows reading from any number of other tables simultaneously.
Reading is automatically parallelized. Writing to a table is not supported. When reading, the indexes of tables that are actually being read are used, if they exist.
The `Merge` engine accepts parameters: the database name and a regular expression for tables.
`Merge` 引擎 (不要跟 `MergeTree` 引擎混淆) 本身不存储数据,但可用于同时从任意多个其他的表中读取数据。
读是自动并行的,不支持写入。读取时,那些被真正读取到数据的表的索引(如果有的话)会被使用。
`Merge` 引擎的参数:一个数据库名和一个用于匹配表名的正则表达式。
Example:
示例:
```
Merge(hits, '^WatchLog')
```
Data will be read from the tables in the `hits` database that have names that match the regular expression '`^WatchLog`'.
数据会从 `hits` 数据库中表名匹配正则 '`^WatchLog`' 的表中读取。
Instead of the database name, you can use a constant expression that returns a string. For example, `currentDatabase()`.
除了数据库名,你也可以用一个返回字符串的常量表达式。例如, `currentDatabase()` 。
Regular expressions — [re2](https://github.com/google/re2) (supports a subset of PCRE), case-sensitive.
See the notes about escaping symbols in regular expressions in the "match" section.
正则表达式 — [re2](https://github.com/google/re2) (支持 PCRE 一个子集的功能),大小写敏感。
了解关于正则表达式中转义字符的说明可参看 "match" 一节。
When selecting tables to read, the `Merge` table itself will not be selected, even if it matches the regex. This is to avoid loops.
It is possible to create two `Merge` tables that will endlessly try to read each others' data, but this is not a good idea.
当选择需要读的表时,`Merge` 表本身会被排除,即使它匹配上了该正则。这样设计为了避免循环。
当然,是能够创建两个相互无限递归读取对方数据的 `Merge` 表的,但这并没有什么意义。
The typical way to use the `Merge` engine is for working with a large number of `TinyLog` tables as if with a single table.
`Merge` 引擎的一个典型应用是可以像使用一张表一样使用大量的 `TinyLog` 表。
Example 2:
示例 2:
Let's say you have a old table (WatchLog_old) and decided to change partitioning without moving data to a new table (WatchLog_new) and you need to see data from both tables.
我们假定你有一个旧表(WatchLog_old),你想改变数据分区了,但又不想把旧数据转移到新表(WatchLog_new),并且你需要同时能看到这两个表的数据。
```
CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64)
@ -49,21 +49,21 @@ FROM WatchLog
```
## Virtual Columns
## 虚拟列
Virtual columns are columns that are provided by the table engine, regardless of the table definition. In other words, these columns are not specified in `CREATE TABLE`, but they are accessible for `SELECT`.
虚拟列是一种由表引擎提供而不是在表定义中的列。换种说法就是,这些列并没有在 `CREATE TABLE` 中指定,但可以在 `SELECT` 中使用。
Virtual columns differ from normal columns in the following ways:
下面列出虚拟列跟普通列的不同点:
- They are not specified in table definitions.
- Data can't be added to them with `INSERT`.
- When using `INSERT` without specifying the list of columns, virtual columns are ignored.
- They are not selected when using the asterisk (`SELECT *`).
- Virtual columns are not shown in `SHOW CREATE TABLE` and `DESC TABLE` queries.
- 虚拟列不在表结构定义里指定。
- 不能用 `INSERT` 向虚拟列写数据。
- 使用不指定列名的 `INSERT` 语句时,虚拟列会被忽略掉。
- 使用星号通配符( `SELECT *` )时虚拟列不会包含在里面。
- 虚拟列不会出现在 `SHOW CREATE TABLE` 和 `DESC TABLE` 的查询结果里。
The `Merge` type table contains a virtual `_table` column of the `String` type. (If the table already has a `_table` column, the virtual column is called `_table1`; if you already have `_table1`, it's called `_table2`, and so on.) It contains the name of the table that data was read from.
`Merge` 类型的表包括一个 `String` 类型的 `_table` 虚拟列。(如果该表本来已有了一个 `_table` 的列,那这个虚拟列会命名为 `_table1` ;如果 `_table1` 也本就存在了,那这个虚拟列会被命名为 `_table2` ,依此类推)该列包含被读数据的表名。
If the `WHERE/PREWHERE` clause contains conditions for the `_table` column that do not depend on other table columns (as one of the conjunction elements, or as an entire expression), these conditions are used as an index. The conditions are performed on a data set of table names to read data from, and the read operation will be performed from only those tables that the condition was triggered on.
如果 `WHERE/PREWHERE` 子句包含了带 `_table` 的条件,并且没有依赖其他的列(如作为表达式谓词链接的一个子项或作为整个的表达式),这些条件的作用会像索引一样。这些条件会在那些可能被读数据的表的表名上执行,并且读操作只会在那些满足了该条件的表上去执行。
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/merge/) <!--hide-->
[来源文章](https://clickhouse.yandex/docs/en/operations/table_engines/merge/) <!--hide-->