mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-18 04:12:19 +00:00
Merge branch 'master' into remove-compiler
This commit is contained in:
commit
32a44936e0
@ -264,11 +264,10 @@ if (USE_STATIC_LIBRARIES AND HAVE_NO_PIE)
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLAG_NO_PIE}")
|
||||
endif ()
|
||||
|
||||
# TODO: only make this extra-checks in CI builds, since a lot of contrib libs won't link -
|
||||
# CI works around this problem by explicitly adding GLIBC_COMPATIBILITY flag.
|
||||
if (NOT SANITIZE AND YANDEX_OFFICIAL_BUILD)
|
||||
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-undefined")
|
||||
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-undefined")
|
||||
# Make this extra-checks for correct library dependencies.
|
||||
if (NOT SANITIZE)
|
||||
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-undefined")
|
||||
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-undefined")
|
||||
endif ()
|
||||
|
||||
include (cmake/find_unwind.cmake)
|
||||
|
@ -219,7 +219,7 @@ endif()
|
||||
add_library(${ARROW_LIBRARY} ${ARROW_SRCS})
|
||||
add_dependencies(${ARROW_LIBRARY} protoc)
|
||||
target_include_directories(${ARROW_LIBRARY} SYSTEM PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp/src ${Boost_INCLUDE_DIRS})
|
||||
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${PROTOBUF_LIBRARIES} Threads::Threads)
|
||||
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${DOUBLE_CONVERSION_LIBRARIES} ${Protobuf_LIBRARY} Threads::Threads)
|
||||
if (ARROW_WITH_LZ4)
|
||||
target_link_libraries(${ARROW_LIBRARY} PRIVATE ${LZ4_LIBRARY})
|
||||
endif()
|
||||
|
@ -176,7 +176,7 @@ struct QuantileExactExclusive : public QuantileExact<Value>
|
||||
}
|
||||
};
|
||||
|
||||
/// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1)
|
||||
/// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1)
|
||||
template <typename Value>
|
||||
struct QuantileExactInclusive : public QuantileExact<Value>
|
||||
{
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/PODArray.h>
|
||||
#include <Common/OptimizedRegularExpression.h>
|
||||
|
||||
#define MIN_LENGTH_FOR_STRSTR 3
|
||||
@ -413,9 +414,9 @@ unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject
|
||||
return 0;
|
||||
}
|
||||
|
||||
StringPieceType pieces[MAX_SUBPATTERNS];
|
||||
DB::PODArrayWithStackMemory<StringPieceType, sizeof(StringPieceType) * (MAX_SUBPATTERNS+1)> pieces(limit);
|
||||
|
||||
if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit))
|
||||
if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces.data(), pieces.size()))
|
||||
return 0;
|
||||
else
|
||||
{
|
||||
|
@ -1,5 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <Core/Defines.h>
|
||||
#include <ext/range.h>
|
||||
@ -23,6 +25,7 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int UNSUPPORTED_PARAMETER;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
|
||||
@ -157,7 +160,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
ALWAYS_INLINE bool compare(const UInt8 * pos) const
|
||||
ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const
|
||||
{
|
||||
static const Poco::UTF8Encoding utf8;
|
||||
|
||||
@ -374,7 +377,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
ALWAYS_INLINE bool compare(const UInt8 * pos) const
|
||||
ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const
|
||||
{
|
||||
#ifdef __SSE4_1__
|
||||
if (pageSafe(pos))
|
||||
@ -567,7 +570,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
ALWAYS_INLINE bool compare(const UInt8 * pos) const
|
||||
ALWAYS_INLINE bool compare(const UInt8 * /*haystack*/, const UInt8 * /*haystack_end*/, const UInt8 * pos) const
|
||||
{
|
||||
#ifdef __SSE4_1__
|
||||
if (pageSafe(pos))
|
||||
@ -697,11 +700,81 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Searches for a needle that is surrounded by token separators (or by the haystack boundaries).
|
||||
// Separators are any bytes inside the ASCII range (0-127) that are not alphanumeric.
|
||||
// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
|
||||
// should work just fine. But any Unicode whitespace is not considered a token separator.
|
||||
// StringSearcher is the wrapped substring searcher; it must provide
// compare(haystack, haystack_end, pos) and search(pos, haystack_end), which are the calls made below.
template <typename StringSearcher>
|
||||
class TokenSearcher
|
||||
{
|
||||
// Underlying substring searcher that performs the actual byte matching.
StringSearcher searcher;
|
||||
// Needle length in bytes; used to locate the byte that follows a candidate match.
size_t needle_size;
|
||||
|
||||
public:
|
||||
// Builds the wrapped searcher for the given needle.
// Throws Exception with ErrorCodes::BAD_ARGUMENTS if any byte of the needle is a token separator.
TokenSearcher(const char * const needle_, const size_t needle_size_)
|
||||
: searcher{needle_, needle_size_},
|
||||
needle_size(needle_size_)
|
||||
{
|
||||
if (std::any_of(reinterpret_cast<const UInt8 *>(needle_), reinterpret_cast<const UInt8 *>(needle_) + needle_size_, isTokenSeparator))
|
||||
{
|
||||
throw Exception{"Needle must not contain whitespace or separator characters", ErrorCodes::BAD_ARGUMENTS};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Returns true if the needle matches at pos AND the match is delimited as a token.
// The token-boundary check runs first, so the wrapped searcher is not consulted for non-token positions.
ALWAYS_INLINE bool compare(const UInt8 * haystack, const UInt8 * haystack_end, const UInt8 * pos) const
|
||||
{
|
||||
// Use the wrapped searcher only if pos is at the beginning of a token and pos + needle_size is at the end of a token.
if (isToken(haystack, haystack_end, pos))
|
||||
return searcher.compare(haystack, haystack_end, pos);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns a pointer to the first occurrence of the needle in [haystack, haystack_end) that is a token,
// or haystack_end if there is none.
const UInt8 * search(const UInt8 * haystack, const UInt8 * const haystack_end) const
|
||||
{
|
||||
// Use searcher.search(), then verify that the returned position is a token;
|
||||
// if it is not, skip it and re-run the search from behind it.
|
||||
|
||||
const UInt8 * pos = haystack;
|
||||
while (pos < haystack_end)
|
||||
{
|
||||
pos = searcher.search(pos, haystack_end);
|
||||
if (pos == haystack_end || isToken(haystack, haystack_end, pos))
|
||||
return pos;
|
||||
|
||||
// Assuming that the needle does not contain any token separators (enforced by the constructor),
// the whole rejected match can be skipped at once.
// NOTE(review): with needle_size == 0 this does not advance pos; presumably callers never
// reach this loop with an empty needle - confirm.
|
||||
pos += needle_size;
|
||||
}
|
||||
return haystack_end;
|
||||
}
|
||||
|
||||
// Convenience overload: same search, with the haystack given as pointer + size.
const UInt8 * search(const UInt8 * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
}
|
||||
|
||||
// Checks whether a needle-sized range starting at p is delimited as a token:
// p is the haystack start or the previous byte is a separator, and
// p + needle_size reaches haystack_end or the byte at p + needle_size is a separator.
// It does not compare the bytes at p with the needle.
ALWAYS_INLINE bool isToken(const UInt8 * haystack, const UInt8 * const haystack_end, const UInt8* p) const
|
||||
{
|
||||
return (p == haystack || isTokenSeparator(*(p - 1)))
|
||||
&& (p + needle_size >= haystack_end || isTokenSeparator(*(p + needle_size)));
|
||||
}
|
||||
|
||||
// A byte is a separator unless isAlphaNumericASCII(c) holds or isASCII(c) does not hold.
ALWAYS_INLINE static bool isTokenSeparator(const UInt8 c)
|
||||
{
|
||||
if (isAlphaNumericASCII(c) || !isASCII(c))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>;
|
||||
using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
|
||||
using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
|
||||
using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
|
||||
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
|
||||
|
||||
|
||||
/** Uses functions from libc.
|
||||
|
@ -327,6 +327,8 @@ protected:
|
||||
FallbackSearcher fallback_searcher;
|
||||
|
||||
public:
|
||||
using Searcher = FallbackSearcher;
|
||||
|
||||
/** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
|
||||
* If you specify it small enough, the fallback algorithm will be used,
|
||||
* since it is considered that it's useless to waste time initializing the hash table.
|
||||
@ -373,7 +375,7 @@ public:
|
||||
const auto res = pos - (hash[cell_num] - 1);
|
||||
|
||||
/// pointer in the code is always padded array so we can use pagesafe semantics
|
||||
if (fallback_searcher.compare(res))
|
||||
if (fallback_searcher.compare(haystack, haystack_end, res))
|
||||
return res;
|
||||
}
|
||||
}
|
||||
@ -520,7 +522,7 @@ public:
|
||||
{
|
||||
const auto res = pos - (hash[cell_num].off - 1);
|
||||
const size_t ind = hash[cell_num].id;
|
||||
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
|
||||
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -552,7 +554,7 @@ public:
|
||||
{
|
||||
const auto res = pos - (hash[cell_num].off - 1);
|
||||
const size_t ind = hash[cell_num].id;
|
||||
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
|
||||
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
|
||||
ans = std::min(ans, ind);
|
||||
}
|
||||
}
|
||||
@ -590,7 +592,7 @@ public:
|
||||
{
|
||||
const auto res = pos - (hash[cell_num].off - 1);
|
||||
const size_t ind = hash[cell_num].id;
|
||||
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
|
||||
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
|
||||
ans = std::min<UInt64>(ans, res - haystack);
|
||||
}
|
||||
}
|
||||
@ -625,7 +627,7 @@ public:
|
||||
{
|
||||
const auto * res = pos - (hash[cell_num].off - 1);
|
||||
const size_t ind = hash[cell_num].id;
|
||||
if (ans[ind] == 0 && res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
|
||||
if (ans[ind] == 0 && res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(haystack, haystack_end, res))
|
||||
ans[ind] = count_chars(haystack, res);
|
||||
}
|
||||
}
|
||||
@ -650,6 +652,8 @@ using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearche
|
||||
using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
|
||||
using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
|
||||
|
||||
using VolnitskyToken = VolnitskyBase<true, true, ASCIICaseSensitiveTokenSearcher>;
|
||||
|
||||
using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
|
||||
using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;
|
||||
using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>;
|
||||
|
@ -234,7 +234,7 @@ void DataTypeEnum<Type>::deserializeBinaryBulk(
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
void DataTypeEnum<Type>::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const
|
||||
void DataTypeEnum<Type>::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const
|
||||
{
|
||||
if (value_index)
|
||||
return;
|
||||
|
@ -434,6 +434,74 @@ struct MultiSearchFirstIndexImpl
|
||||
}
|
||||
};
|
||||
|
||||
/** Token search in a string: the needle must be surrounded by separator chars, like whitespace or punctuation.
  * The search itself is delegated to VolnitskyToken.
  * When negate_result is true, every stored result is inverted.
|
||||
*/
|
||||
template <bool negate_result = false>
|
||||
struct HasTokenImpl
|
||||
{
|
||||
using ResultType = UInt8;
|
||||
|
||||
/// Searches a constant pattern in every row of a string column and writes one UInt8 per row into res.
/// data and offsets describe the column; presumably offsets[i] is the end offset of row i inside data - TODO confirm.
/// res is indexed per row and is not resized here.
static void vector_constant(
|
||||
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res)
|
||||
{
|
||||
/// Nothing to do for a column without rows.
if (offsets.empty())
|
||||
return;
|
||||
|
||||
const UInt8 * begin = data.data();
|
||||
const UInt8 * pos = begin;
|
||||
const UInt8 * end = pos + data.size();
|
||||
|
||||
/// The current index in the array of strings.
|
||||
size_t i = 0;
|
||||
|
||||
/// The third argument is the total number of bytes that will be searched.
VolnitskyToken searcher(pattern.data(), pattern.size(), end - pos);
|
||||
|
||||
/// We will search for the next occurrence in all rows at once.
|
||||
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
||||
{
|
||||
/// Let's determine which index it refers to.
/// Rows that end at or before the match contain no occurrence.
|
||||
while (begin + offsets[i] <= pos)
|
||||
{
|
||||
res[i] = negate_result;
|
||||
++i;
|
||||
}
|
||||
|
||||
/// We check that the entry does not pass through the boundaries of strings.
/// NOTE(review): the strict '<' presumably accounts for a terminating byte at the end of each row - confirm.
|
||||
if (pos + pattern.size() < begin + offsets[i])
|
||||
res[i] = !negate_result;
|
||||
else
|
||||
res[i] = negate_result;
|
||||
|
||||
/// The row is decided; continue the search from the start of the next row.
pos = begin + offsets[i];
|
||||
++i;
|
||||
}
|
||||
|
||||
/// Tail, in which there can be no substring.
|
||||
if (i < res.size())
|
||||
memset(&res[i], negate_result, (res.size() - i) * sizeof(res[0]));
|
||||
}
|
||||
|
||||
/// Searches a constant pattern in a constant string and stores the (possibly negated) result in res.
static void constant_constant(const std::string & data, const std::string & pattern, UInt8 & res)
|
||||
{
|
||||
VolnitskyToken searcher(pattern.data(), pattern.size(), data.size());
|
||||
/// The searcher result is compared against the end of data to detect "not found".
/// NOTE(review): .base() on std::string::iterator is presumably specific to the standard library
/// implementation in use, rather than part of the portable iterator interface - confirm.
const auto found = searcher.search(data.c_str(), data.size()) != data.end().base();
|
||||
res = negate_result ^ found;
|
||||
}
|
||||
|
||||
/// Non-constant needle is not supported: always throws ILLEGAL_COLUMN.
template <typename... Args>
|
||||
static void vector_vector(Args &&...)
|
||||
{
|
||||
throw Exception("Function 'hasToken' does not support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
|
||||
/// Search different needles in single haystack.
/// Not supported either: always throws ILLEGAL_COLUMN.
|
||||
template <typename... Args>
|
||||
static void constant_vector(Args &&...)
|
||||
{
|
||||
throw Exception("Function 'hasToken' does not support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct NamePosition
|
||||
{
|
||||
@ -516,6 +584,11 @@ struct NameMultiSearchFirstPositionCaseInsensitiveUTF8
|
||||
static constexpr auto name = "multiSearchFirstPositionCaseInsensitiveUTF8";
|
||||
};
|
||||
|
||||
/// Tag type carrying the function name string "hasToken";
/// it is passed as the name argument of FunctionsStringSearch where FunctionHasToken is declared.
struct NameHasToken
|
||||
{
|
||||
static constexpr auto name = "hasToken";
|
||||
};
|
||||
|
||||
|
||||
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
|
||||
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
|
||||
@ -542,6 +615,7 @@ using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSea
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitive = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<false>, NameHasToken>;
|
||||
|
||||
void registerFunctionsStringSearch(FunctionFactory & factory)
|
||||
{
|
||||
@ -570,6 +644,8 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
|
||||
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitive>();
|
||||
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();
|
||||
|
||||
factory.registerFunction<FunctionHasToken>();
|
||||
|
||||
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
|
||||
}
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ const UInt8 geohash_base32_decode_lookup_table[256] = {
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 10, 11, 12, 13, 14, 15, 16, 0xFF, 17, 18, 0xFF, 19, 20, 0xFF,
|
||||
|
@ -1,10 +1,8 @@
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <common/find_symbols.h>
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -60,7 +58,7 @@ public:
|
||||
execute(reinterpret_cast<const UInt8 *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length);
|
||||
|
||||
res_data.resize(res_data.size() + length + 1);
|
||||
memcpy(&res_data[res_offset], start, length);
|
||||
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
|
||||
res_offset += length + 1;
|
||||
res_data[res_offset - 1] = '\0';
|
||||
|
||||
@ -77,59 +75,27 @@ public:
|
||||
private:
|
||||
static void execute(const UInt8 * data, size_t size, const UInt8 *& res_data, size_t & res_size)
|
||||
{
|
||||
size_t chars_to_trim_left = 0;
|
||||
size_t chars_to_trim_right = 0;
|
||||
char whitespace = ' ';
|
||||
#ifdef __SSE4_2__
|
||||
const auto bytes_sse = sizeof(__m128i);
|
||||
const auto size_sse = size - (size % bytes_sse);
|
||||
const auto whitespace_mask = _mm_set1_epi8(whitespace);
|
||||
constexpr auto base_sse_mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
|
||||
auto mask = bytes_sse;
|
||||
#endif
|
||||
const char * char_data = reinterpret_cast<const char *>(data);
|
||||
const char * char_end = char_data + size;
|
||||
|
||||
if constexpr (mode::trim_left)
|
||||
{
|
||||
#ifdef __SSE4_2__
|
||||
/// skip whitespace from left in blocks of up to 16 characters
|
||||
|
||||
/// Avoid gcc bug: _mm_cmpistri: error: the third argument must be an 8-bit immediate
|
||||
enum { left_sse_mode = base_sse_mode | _SIDD_LEAST_SIGNIFICANT };
|
||||
while (mask == bytes_sse && chars_to_trim_left < size_sse)
|
||||
{
|
||||
const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + chars_to_trim_left));
|
||||
mask = _mm_cmpistri(whitespace_mask, chars, left_sse_mode);
|
||||
chars_to_trim_left += mask;
|
||||
}
|
||||
#endif
|
||||
/// skip remaining whitespace from left, character by character
|
||||
while (chars_to_trim_left < size && data[chars_to_trim_left] == whitespace)
|
||||
++chars_to_trim_left;
|
||||
const char * found = find_first_not_symbols<' '>(char_data, char_end);
|
||||
size_t num_chars = found - char_data;
|
||||
char_data += num_chars;
|
||||
}
|
||||
|
||||
if constexpr (mode::trim_right)
|
||||
{
|
||||
const auto trim_right_size = size - chars_to_trim_left;
|
||||
#ifdef __SSE4_2__
|
||||
/// try to skip whitespace from right in blocks of up to 16 characters
|
||||
|
||||
/// Avoid gcc bug: _mm_cmpistri: error: the third argument must be an 8-bit immediate
|
||||
enum { right_sse_mode = base_sse_mode | _SIDD_MOST_SIGNIFICANT };
|
||||
const auto trim_right_size_sse = trim_right_size - (trim_right_size % bytes_sse);
|
||||
while (mask == bytes_sse && chars_to_trim_right < trim_right_size_sse)
|
||||
{
|
||||
const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + size - chars_to_trim_right - bytes_sse));
|
||||
mask = _mm_cmpistri(whitespace_mask, chars, right_sse_mode);
|
||||
chars_to_trim_right += mask;
|
||||
}
|
||||
#endif
|
||||
/// skip remaining whitespace from right, character by character
|
||||
while (chars_to_trim_right < trim_right_size && data[size - chars_to_trim_right - 1] == whitespace)
|
||||
++chars_to_trim_right;
|
||||
const char * found = find_last_not_symbols_or_null<' '>(char_data, char_end);
|
||||
if (found)
|
||||
char_end = found + 1;
|
||||
else
|
||||
char_end = char_data;
|
||||
}
|
||||
|
||||
res_data = data + chars_to_trim_left;
|
||||
res_size = size - chars_to_trim_left - chars_to_trim_right;
|
||||
res_data = reinterpret_cast<const UInt8 *>(char_data);
|
||||
res_size = char_end - char_data;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -40,7 +40,10 @@ HDFSBuilderPtr createHDFSBuilder(const Poco::URI & uri)
|
||||
hdfsBuilderSetUserName(builder.get(), user.c_str());
|
||||
}
|
||||
hdfsBuilderSetNameNode(builder.get(), host.c_str());
|
||||
hdfsBuilderSetNameNodePort(builder.get(), port);
|
||||
if (port != 0)
|
||||
{
|
||||
hdfsBuilderSetNameNodePort(builder.get(), port);
|
||||
}
|
||||
return builder;
|
||||
}
|
||||
|
||||
|
@ -12,7 +12,7 @@ namespace DB
|
||||
|
||||
ProtobufRowInputFormat::ProtobufRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSchemaInfo & info_)
|
||||
: IRowInputFormat(header_, in_, params_)
|
||||
, data_types(header_.getDataTypes())
|
||||
, data_types(header_.getDataTypes())
|
||||
, reader(in, ProtobufSchemas::instance().getMessageTypeForFormatSchema(info_), header_.getNames())
|
||||
{
|
||||
}
|
||||
|
@ -168,6 +168,19 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
|
||||
return true;
|
||||
}
|
||||
},
|
||||
{
|
||||
"hasToken",
|
||||
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
|
||||
{
|
||||
out.function = RPNElement::FUNCTION_EQUALS;
|
||||
out.bloom_filter = std::make_unique<BloomFilter>(
|
||||
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
|
||||
|
||||
const auto & str = value.get<String>();
|
||||
stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
|
||||
return true;
|
||||
}
|
||||
},
|
||||
{
|
||||
"startsWith",
|
||||
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
|
||||
|
@ -405,25 +405,6 @@ bool MergeTreeIndexConditionSet::operatorFromAST(ASTPtr & node) const
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool checkAtomName(const String & name)
|
||||
{
|
||||
static std::set<String> atoms = {
|
||||
"notEquals",
|
||||
"equals",
|
||||
"less",
|
||||
"greater",
|
||||
"lessOrEquals",
|
||||
"greaterOrEquals",
|
||||
"in",
|
||||
"notIn",
|
||||
"like",
|
||||
"startsWith",
|
||||
"endsWith",
|
||||
"multiSearchAny"
|
||||
};
|
||||
return atoms.find(name) != atoms.end();
|
||||
}
|
||||
|
||||
bool MergeTreeIndexConditionSet::checkASTUseless(const ASTPtr &node, bool atomic) const
|
||||
{
|
||||
if (const auto * func = node->as<ASTFunction>())
|
||||
@ -439,16 +420,14 @@ bool MergeTreeIndexConditionSet::checkASTUseless(const ASTPtr &node, bool atomic
|
||||
return checkASTUseless(args[0], atomic) || checkASTUseless(args[1], atomic);
|
||||
else if (func->name == "not")
|
||||
return checkASTUseless(args[0], atomic);
|
||||
else if (!atomic && checkAtomName(func->name))
|
||||
return checkASTUseless(node, true);
|
||||
else
|
||||
return std::any_of(args.begin(), args.end(),
|
||||
[this, &atomic](const auto & arg) { return checkASTUseless(arg, atomic); });
|
||||
[this](const auto & arg) { return checkASTUseless(arg, true); });
|
||||
}
|
||||
else if (const auto * literal = node->as<ASTLiteral>())
|
||||
return !atomic && literal->value.get<bool>();
|
||||
else if (const auto * identifier = node->as<ASTIdentifier>())
|
||||
return key_columns.find(identifier->getColumnName()) == key_columns.end();
|
||||
return key_columns.find(identifier->getColumnName()) == std::end(key_columns);
|
||||
else
|
||||
return true;
|
||||
}
|
||||
|
@ -108,7 +108,7 @@ size_t MergeTreeRangeReader::DelayedStream::finalize(Block & block)
|
||||
|
||||
|
||||
MergeTreeRangeReader::Stream::Stream(
|
||||
size_t from_mark, size_t to_mark, MergeTreeReader * merge_tree_reader_)
|
||||
size_t from_mark, size_t to_mark, MergeTreeReader * merge_tree_reader_)
|
||||
: current_mark(from_mark), offset_after_current_mark(0)
|
||||
, last_mark(to_mark)
|
||||
, merge_tree_reader(merge_tree_reader_)
|
||||
|
@ -41,7 +41,7 @@ public:
|
||||
{
|
||||
public:
|
||||
DelayedStream() = default;
|
||||
DelayedStream(size_t from_mark, MergeTreeReader * merge_tree_reader);
|
||||
DelayedStream(size_t from_mark, MergeTreeReader * merge_tree_reader);
|
||||
|
||||
/// Read @num_rows rows from @from_mark starting from @offset row
|
||||
/// Returns the number of rows added to block.
|
||||
|
@ -146,7 +146,7 @@ BlockInputStreams StorageSystemNumbers::read(
|
||||
res[i] = std::make_shared<NumbersBlockInputStream>(max_block_size, offset + i * max_block_size, num_streams * max_block_size);
|
||||
|
||||
if (limit) /// This formula is how to split 'limit' elements to 'num_streams' chunks almost uniformly.
|
||||
res[i] = std::make_shared<LimitBlockInputStream>(res[i], *limit * (i + 1) / num_streams - *limit * i / num_streams, 0, false, true);
|
||||
res[i] = std::make_shared<LimitBlockInputStream>(res[i], *limit * (i + 1) / num_streams - *limit * i / num_streams, 0, false, true);
|
||||
}
|
||||
|
||||
return res;
|
||||
|
@ -2,6 +2,5 @@
|
||||
1
|
||||
1
|
||||
1
|
||||
finished 42 readonly SELECT 2, count() FROM system.numbers
|
||||
1
|
||||
44
|
||||
|
@ -30,7 +30,7 @@ ${CLICKHOUSE_CLIENT} --query_id=42 --query='SELECT 43' 2>&1 | grep -cF 'is alrea
|
||||
# Trying to replace query of a different user
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL?query_id=42&replace_running_query=1" -d 'SELECT 1' | grep -cF 'is already running by user'
|
||||
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = '42' SYNC"
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = '42' SYNC" > /dev/null
|
||||
wait
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query_id=42 --query='SELECT 3, count() FROM system.numbers' 2>&1 | grep -cF 'was cancelled' &
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. $CURDIR/../shell_config.sh
|
||||
. $CURDIR/mergetree_mutations.lib
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.minmax_idx;"
|
||||
|
||||
@ -35,8 +36,7 @@ $CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 1;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 5;"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.minmax_idx UPDATE i64 = 5 WHERE i64 = 1;"
|
||||
|
||||
sleep 0.1
|
||||
wait_for_mutation "minmax_idx" "mutation_2.txt" "test"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 1;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 5;"
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. $CURDIR/../shell_config.sh
|
||||
. $CURDIR/mergetree_mutations.lib
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.minmax_idx;"
|
||||
|
||||
@ -38,22 +39,26 @@ SET allow_experimental_data_skipping_indices=1;
|
||||
ALTER TABLE test.minmax_idx ADD INDEX idx (i64, u64 * i64) TYPE minmax GRANULARITY 1;"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.minmax_idx MATERIALIZE INDEX idx IN PARTITION 1;"
|
||||
wait_for_mutation "minmax_idx" "mutation_3.txt" "test"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.minmax_idx MATERIALIZE INDEX idx IN PARTITION 2;"
|
||||
wait_for_mutation "minmax_idx" "mutation_4.txt" "test"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.minmax_idx CLEAR INDEX idx IN PARTITION 1;"
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.minmax_idx CLEAR INDEX idx IN PARTITION 2;"
|
||||
sleep 0.5
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.minmax_idx MATERIALIZE INDEX idx;"
|
||||
wait_for_mutation "minmax_idx" "mutation_5.txt" "test"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. $CURDIR/../shell_config.sh
|
||||
. $CURDIR/mergetree_mutations.lib
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.minmax_idx;"
|
||||
|
||||
@ -42,7 +43,7 @@ $CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.minmax_idx MATERIALIZE INDEX idx IN PARTITION 1;"
|
||||
sleep 0.5
|
||||
wait_for_mutation "minmax_idx" "mutation_3.txt" "test"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.minmax_idx WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. $CURDIR/../shell_config.sh
|
||||
. $CURDIR/mergetree_mutations.lib
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.indices_mutaions1;"
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.indices_mutaions2;"
|
||||
@ -44,19 +45,20 @@ $CLICKHOUSE_CLIENT --query="INSERT INTO test.indices_mutaions1 VALUES
|
||||
(9, 1, 2)"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2 FORMAT JSON;" | grep "rows_read"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.indices_mutaions1 CLEAR INDEX idx IN PARTITION 1;"
|
||||
sleep 1
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2 FORMAT JSON;" | grep "rows_read"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="ALTER TABLE test.indices_mutaions1 MATERIALIZE INDEX idx IN PARTITION 1;"
|
||||
sleep 1
|
||||
wait_for_mutation "indices_mutaions1" "0000000000" "test"
|
||||
wait_for_mutation "indices_mutaions2" "0000000000" "test"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2;"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2 FORMAT JSON" | grep "rows_read"
|
||||
$CLICKHOUSE_CLIENT --query="SELECT count() FROM test.indices_mutaions2 WHERE i64 = 2 FORMAT JSON;" | grep "rows_read"
|
||||
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE test.indices_mutaions1"
|
||||
$CLICKHOUSE_CLIENT --query="DROP TABLE test.indices_mutaions2"
|
||||
|
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
CLICKHOUSE_CURL_TIMEOUT=30
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. $CURDIR/../shell_config.sh
|
||||
|
||||
|
124
dbms/tests/queries/0_stateless/00990_hasToken.python
Executable file
124
dbms/tests/queries/0_stateless/00990_hasToken.python
Executable file
@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
|
||||
import re
|
||||
|
||||
HAYSTACKS = [
|
||||
"hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay needle",
|
||||
"needle hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay hay",
|
||||
"hay hay hay hay hay hay hay hay hay needle hay hay hay hay hay hay hay hay hay",
|
||||
]
|
||||
|
||||
NEEDLE = "needle"
|
||||
|
||||
HAY_RE = re.compile(r'\bhay\b', re.IGNORECASE)
|
||||
NEEDLE_RE = re.compile(r'\bneedle\b', re.IGNORECASE)
|
||||
|
||||
def replace_follow_case(replacement):
|
||||
def func(match):
|
||||
g = match.group()
|
||||
if g.islower(): return replacement.lower()
|
||||
if g.istitle(): return replacement.title()
|
||||
if g.isupper(): return replacement.upper()
|
||||
return replacement
|
||||
return func
|
||||
|
||||
def replace_separators(query, new_sep):
|
||||
SEP_RE = re.compile('\\s+')
|
||||
result = SEP_RE.sub(new_sep, query)
|
||||
return result
|
||||
|
||||
def enlarge_haystack(query, times, separator=''):
|
||||
return HAY_RE.sub(replace_follow_case(('hay' + separator) * times), query)
|
||||
|
||||
def small_needle(query):
|
||||
return NEEDLE_RE.sub(replace_follow_case('n'), query)
|
||||
|
||||
def remove_needle(query):
|
||||
return NEEDLE_RE.sub('', query)
|
||||
|
||||
def replace_needle(query, new_needle):
|
||||
return NEEDLE_RE.sub(new_needle, query)
|
||||
|
||||
# with str.lower, str.upper, str.title and such
|
||||
def transform_needle(query, string_transformation_func):
|
||||
def replace_with_transformation(match):
|
||||
g = match.group()
|
||||
return string_transformation_func(g)
|
||||
|
||||
return NEEDLE_RE.sub(replace_with_transformation, query)
|
||||
|
||||
|
||||
def create_cases(table_row_template, table_query_template, const_query_template):
|
||||
const_queries = []
|
||||
table_rows = []
|
||||
table_queries = set()
|
||||
|
||||
def add_case(haystack, needle, match):
|
||||
match = int(match)
|
||||
const_queries.append(const_query_template.format(haystack=haystack, needle=needle, match=match))
|
||||
table_queries.add(table_query_template.format(haystack=haystack, needle=needle, match=match))
|
||||
table_rows.append(table_row_template.format(haystack=haystack, needle=needle, match=match))
|
||||
|
||||
# Negative cases
|
||||
add_case(remove_needle(HAYSTACKS[0]), NEEDLE, False)
|
||||
for haystack in HAYSTACKS:
|
||||
add_case(transform_needle(haystack, str.title), NEEDLE, False)
|
||||
sep = ''
|
||||
h = replace_separators(haystack, sep)
|
||||
add_case(h, NEEDLE, False)
|
||||
add_case(small_needle(h), small_needle(NEEDLE), False)
|
||||
add_case(enlarge_haystack(h, 10, sep), NEEDLE, False)
|
||||
|
||||
# positive cases
|
||||
for haystack in HAYSTACKS:
|
||||
add_case(transform_needle(haystack, str.title), transform_needle(NEEDLE, str.title), True)
|
||||
add_case(transform_needle(haystack, str.upper), transform_needle(NEEDLE, str.upper), True)
|
||||
|
||||
# Not checking all separators since some (like ' and \n) cause issues when coupled with
|
||||
# re-based replacement and quoting in query
|
||||
# others are rare in practice, and checking all separators makes this test too lengthy.
|
||||
|
||||
# r'\\\\' turns into a single '\' in query
|
||||
#separators = list(''' \t`~!@#$%^&*()-=+|]}[{";:/?.>,<''') + [r'\\\\']
|
||||
separators = list(''' \t;:?.,''') + [r'\\\\']
|
||||
for sep in separators:
|
||||
h = replace_separators(haystack, sep)
|
||||
add_case(h, NEEDLE, True)
|
||||
add_case(small_needle(h), small_needle(NEEDLE), True)
|
||||
add_case(enlarge_haystack(h, 200, sep), NEEDLE, True)
|
||||
add_case(replace_needle(h, 'иголка'), replace_needle(NEEDLE, 'иголка'), True)
|
||||
add_case(replace_needle(h, '指针'), replace_needle(NEEDLE, '指针'), True)
|
||||
|
||||
return table_rows, table_queries, const_queries
|
||||
|
||||
def main():
|
||||
|
||||
def query(x):
|
||||
print x
|
||||
|
||||
CONST_QUERY = """SELECT hasToken('{haystack}', '{needle}'), ' expecting ', {match};"""
|
||||
#SELECT hasToken(haystack, '{needle}') FROM ht WHERE needle = '{needle}' AND match = {match};"""
|
||||
TABLE_QUERY = """WITH '{needle}' as n SELECT haystack, needle, hasToken(haystack, n) as result FROM ht WHERE needle = n AND result != match;"""
|
||||
TABLE_ROW = """('{haystack}', '{needle}', {match})"""
|
||||
|
||||
rows, table_queries, const_queries = create_cases(TABLE_ROW, TABLE_QUERY, CONST_QUERY)
|
||||
for q in const_queries:
|
||||
query(q)
|
||||
|
||||
query("""DROP TABLE IF EXISTS ht;
|
||||
CREATE TABLE IF NOT EXISTS
|
||||
ht
|
||||
(
|
||||
haystack String,
|
||||
needle String,
|
||||
match UInt8
|
||||
)
|
||||
ENGINE MergeTree()
|
||||
ORDER BY haystack;
|
||||
INSERT INTO ht VALUES {values};""".format(values=", ".join(rows)))
|
||||
for q in sorted(table_queries):
|
||||
query(q)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
139
dbms/tests/queries/0_stateless/00990_hasToken.reference
Normal file
139
dbms/tests/queries/0_stateless/00990_hasToken.reference
Normal file
@ -0,0 +1,139 @@
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
0 expecting 0
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
||||
1 expecting 1
|
8
dbms/tests/queries/0_stateless/00990_hasToken.sh
Executable file
8
dbms/tests/queries/0_stateless/00990_hasToken.sh
Executable file
@ -0,0 +1,8 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
. $CURDIR/../shell_config.sh
|
||||
|
||||
# We should have correct env vars from shell_config.sh to run this test
|
||||
|
||||
python $CURDIR/00990_hasToken.python | ${CLICKHOUSE_CLIENT} -nm
|
@ -0,0 +1,3 @@
|
||||
2007
|
||||
2007
|
||||
2007
|
@ -0,0 +1,33 @@
|
||||
SET allow_experimental_data_skipping_indices = 1;
|
||||
|
||||
DROP TABLE IF EXISTS bloom_filter;
|
||||
|
||||
CREATE TABLE bloom_filter
|
||||
(
|
||||
id UInt64,
|
||||
s String,
|
||||
INDEX tok_bf (s, lower(s)) TYPE tokenbf_v1(512, 3, 0) GRANULARITY 1
|
||||
) ENGINE = MergeTree() ORDER BY id SETTINGS index_granularity = 8;
|
||||
|
||||
insert into bloom_filter select number, 'yyy,uuu' from numbers(1024);
|
||||
insert into bloom_filter select number+2000, 'abc,def,zzz' from numbers(8);
|
||||
insert into bloom_filter select number+3000, 'yyy,uuu' from numbers(1024);
|
||||
insert into bloom_filter select number+3000, 'abcdefzzz' from numbers(1024);
|
||||
|
||||
set max_rows_to_read = 16;
|
||||
|
||||
SELECT max(id) FROM bloom_filter WHERE hasToken(s, 'abc');
|
||||
SELECT max(id) FROM bloom_filter WHERE hasToken(s, 'def');
|
||||
SELECT max(id) FROM bloom_filter WHERE hasToken(s, 'zzz');
|
||||
|
||||
-- invert result
|
||||
-- this does not work as expected, reading more rows than it should
|
||||
-- SELECT max(id) FROM bloom_filter WHERE NOT hasToken(s, 'yyy');
|
||||
|
||||
-- accessing too many rows
|
||||
SELECT max(id) FROM bloom_filter WHERE hasToken(s, 'yyy'); -- { serverError 158 }
|
||||
|
||||
-- this syntax is not supported by tokenbf
|
||||
SELECT max(id) FROM bloom_filter WHERE hasToken(s, 'zzz') == 1; -- { serverError 158 }
|
||||
|
||||
DROP TABLE bloom_filter;
|
@ -0,0 +1 @@
|
||||
['9']
|
@ -0,0 +1 @@
|
||||
SELECT extractAll('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.143 YaBrowser/19.7.2.455 Yowser/2.5 Safari/537.36', '[Y][a-zA-Z]{8}/[1-9]([1-9]+)?(((.?)([0-9]+)?){0,4})?');
|
@ -0,0 +1 @@
|
||||
1
|
24
dbms/tests/queries/0_stateless/00997_set_index_array.sql
Normal file
24
dbms/tests/queries/0_stateless/00997_set_index_array.sql
Normal file
@ -0,0 +1,24 @@
|
||||
SET allow_experimental_data_skipping_indices = 1;
|
||||
|
||||
DROP TABLE IF EXISTS test.set_array;
|
||||
|
||||
CREATE TABLE test.set_array
|
||||
(
|
||||
primary_key String,
|
||||
index_array Array(UInt64),
|
||||
INDEX additional_index_array (index_array) TYPE set(10000) GRANULARITY 1
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY (primary_key);
|
||||
|
||||
INSERT INTO test.set_array
|
||||
select
|
||||
toString(intDiv(number, 1000000)) as primary_key,
|
||||
array(number) as index_array
|
||||
from system.numbers
|
||||
limit 10000000;
|
||||
|
||||
SET max_rows_to_read = 8192;
|
||||
|
||||
select count() from test.set_array where has(index_array, 333);
|
||||
|
||||
DROP TABLE test.set_array;
|
0
dbms/tests/queries/0_stateless/00997_trim.reference
Normal file
0
dbms/tests/queries/0_stateless/00997_trim.reference
Normal file
20
dbms/tests/queries/0_stateless/00997_trim.sql
Normal file
20
dbms/tests/queries/0_stateless/00997_trim.sql
Normal file
@ -0,0 +1,20 @@
|
||||
WITH
|
||||
'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' AS x,
|
||||
replaceRegexpAll(x, '.', ' ') AS spaces,
|
||||
concat(substring(spaces, 1, rand(1) % 62), substring(x, 1, rand(2) % 62), substring(spaces, 1, rand(3) % 62)) AS s,
|
||||
trimLeft(s) AS sl,
|
||||
trimRight(s) AS sr,
|
||||
trimBoth(s) AS t,
|
||||
replaceRegexpOne(s, '^ +', '') AS slr,
|
||||
replaceRegexpOne(s, ' +$', '') AS srr,
|
||||
replaceRegexpOne(s, '^ *(.*?) *$', '\\1') AS tr
|
||||
SELECT
|
||||
replaceAll(s, ' ', '_'),
|
||||
replaceAll(sl, ' ', '_'),
|
||||
replaceAll(slr, ' ', '_'),
|
||||
replaceAll(sr, ' ', '_'),
|
||||
replaceAll(srr, ' ', '_'),
|
||||
replaceAll(t, ' ', '_'),
|
||||
replaceAll(tr, ' ', '_')
|
||||
FROM numbers(100000)
|
||||
WHERE NOT ((sl = slr) AND (sr = srr) AND (t = tr))
|
@ -46,7 +46,8 @@ export CLICKHOUSE_PORT_INTERSERVER=${CLICKHOUSE_PORT_INTERSERVER:="9009"}
|
||||
export CLICKHOUSE_URL_INTERSERVER=${CLICKHOUSE_URL_INTERSERVER:="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_INTERSERVER}/"}
|
||||
|
||||
export CLICKHOUSE_CURL_COMMAND=${CLICKHOUSE_CURL_COMMAND:="curl"}
|
||||
export CLICKHOUSE_CURL=${CLICKHOUSE_CURL:="${CLICKHOUSE_CURL_COMMAND} --max-time 10"}
|
||||
export CLICKHOUSE_CURL_TIMEOUT=${CLICKHOUSE_CURL_TIMEOUT:="10"}
|
||||
export CLICKHOUSE_CURL=${CLICKHOUSE_CURL:="${CLICKHOUSE_CURL_COMMAND} --max-time ${CLICKHOUSE_CURL_TIMEOUT}"}
|
||||
export CLICKHOUSE_TMP=${CLICKHOUSE_TMP:="."}
|
||||
mkdir -p ${CLICKHOUSE_TMP}
|
||||
|
||||
|
@ -79,7 +79,7 @@ Enables or disables execution of `GROUP BY` clauses in external memory. See [GRO
|
||||
|
||||
Possible values:
|
||||
|
||||
- Maximum volume or RAM (in bytes) that can be used by the single [GROUP BY](../../query_language/select.md#select-group-by-clause) operation.
|
||||
- Maximum volume of RAM (in bytes) that can be used by the single [GROUP BY](../../query_language/select.md#select-group-by-clause) operation.
|
||||
- 0 — `GROUP BY` in external memory disabled.
|
||||
|
||||
Default value: 0.
|
||||
|
@ -193,18 +193,21 @@ RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ...
|
||||
|
||||
Все таблицы переименовываются под глобальной блокировкой. Переименовывание таблицы является лёгкой операцией. Если вы указали после TO другую базу данных, то таблица будет перенесена в эту базу данных. При этом, директории с базами данных должны быть расположены в одной файловой системе (иначе возвращается ошибка).
|
||||
|
||||
## SET
|
||||
## SET {#query-set}
|
||||
|
||||
```sql
|
||||
SET param = value
|
||||
```
|
||||
|
||||
Позволяет установить настройку `param` в значение `value`. Также можно одним запросом установить все настройки из заданного профиля настроек. Для этого укажите 'profile' в качестве имени настройки. Подробнее смотрите в разделе "Настройки".
|
||||
Настройка устанавливается на сессию, или на сервер (глобально), если указано `GLOBAL`.
|
||||
При установке глобальных настроек, эти настройки не применяются к уже запущенной сессии, включая текущую сессию. Она будет использована только для новых сессий.
|
||||
Устанавливает значение `value` для настройки `param` в текущей сессии. [Конфигурационные параметры сервера](../operations/server_settings/index.md) нельзя изменить подобным образом.
|
||||
|
||||
При перезапуске сервера теряются настройки, установленные с помощью `SET`.
|
||||
Установить настройки, которые переживут перезапуск сервера, можно только с помощью конфигурационного файла сервера.
|
||||
Можно одним запросом установить все настройки из заданного профиля настроек.
|
||||
|
||||
```sql
|
||||
SET profile = 'profile-name-from-the-settings-file'
|
||||
```
|
||||
|
||||
Подробности смотрите в разделе [Настройки](../operations/settings/settings.md).
|
||||
|
||||
## SHOW CREATE TABLE
|
||||
|
||||
|
@ -1 +0,0 @@
|
||||
../../en/database_engines/index.md
|
11
docs/zh/database_engines/index.md
Normal file
11
docs/zh/database_engines/index.md
Normal file
@ -0,0 +1,11 @@
|
||||
# 数据库引擎
|
||||
|
||||
您使用的所有表都是由数据库引擎所提供的。
|
||||
|
||||
默认情况下,ClickHouse使用自己的数据库引擎,该引擎提供可配置的[表引擎](../operations/table_engines/index.md)和[所有支持的SQL语法](../query_language/syntax.md)。
|
||||
|
||||
除此之外,您还可以选择使用以下的数据库引擎:
|
||||
|
||||
- [MySQL](mysql.md)
|
||||
|
||||
[来源文章](https://clickhouse.yandex/docs/en/database_engines/) <!--hide-->
|
@ -1 +0,0 @@
|
||||
../../en/database_engines/mysql.md
|
124
docs/zh/database_engines/mysql.md
Normal file
124
docs/zh/database_engines/mysql.md
Normal file
@ -0,0 +1,124 @@
|
||||
# MySQL
|
||||
|
||||
MySQL引擎用于将远程的MySQL服务器中的表映射到ClickHouse中,并允许您对表进行`INSERT`和`SELECT`查询,以方便您在ClickHouse与MySQL之间进行数据交换。
|
||||
|
||||
`MySQL`数据库引擎会将对其的查询转换为MySQL语法并发送到MySQL服务器中,因此您可以执行诸如`SHOW TABLES`或`SHOW CREATE TABLE`之类的操作。
|
||||
|
||||
但您无法对其执行以下操作:
|
||||
|
||||
- `ATTACH`/`DETACH`
|
||||
- `DROP`
|
||||
- `RENAME`
|
||||
- `CREATE TABLE`
|
||||
- `ALTER`
|
||||
|
||||
|
||||
## CREATE DATABASE
|
||||
|
||||
``` sql
|
||||
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster]
|
||||
ENGINE = MySQL('host:port', 'database', 'user', 'password')
|
||||
```
|
||||
|
||||
**MySQL数据库引擎参数**
|
||||
|
||||
- `host:port` — 链接的MySQL地址。
|
||||
- `database` — 链接的MySQL数据库。
|
||||
- `user` — 链接的MySQL用户。
|
||||
- `password` — 链接的MySQL用户密码。
|
||||
|
||||
|
||||
## 支持的类型对应
|
||||
|
||||
MySQL | ClickHouse
|
||||
------|------------
|
||||
UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md)
|
||||
TINYINT | [Int8](../data_types/int_uint.md)
|
||||
UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md)
|
||||
SMALLINT | [Int16](../data_types/int_uint.md)
|
||||
UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md)
|
||||
INT, MEDIUMINT | [Int32](../data_types/int_uint.md)
|
||||
UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md)
|
||||
BIGINT | [Int64](../data_types/int_uint.md)
|
||||
FLOAT | [Float32](../data_types/float.md)
|
||||
DOUBLE | [Float64](../data_types/float.md)
|
||||
DATE | [Date](../data_types/date.md)
|
||||
DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md)
|
||||
BINARY | [FixedString](../data_types/fixedstring.md)
|
||||
|
||||
其他的MySQL数据类型将全部都转换为[String](../data_types/string.md)。
|
||||
|
||||
同时以上的所有类型都支持[Nullable](../data_types/nullable.md)。
|
||||
|
||||
|
||||
## 使用示例
|
||||
|
||||
在MySQL中创建表:
|
||||
|
||||
```
|
||||
mysql> USE test;
|
||||
Database changed
|
||||
|
||||
mysql> CREATE TABLE `mysql_table` (
|
||||
-> `int_id` INT NOT NULL AUTO_INCREMENT,
|
||||
-> `float` FLOAT NOT NULL,
|
||||
-> PRIMARY KEY (`int_id`));
|
||||
Query OK, 0 rows affected (0,09 sec)
|
||||
|
||||
mysql> insert into mysql_table (`int_id`, `float`) VALUES (1,2);
|
||||
Query OK, 1 row affected (0,00 sec)
|
||||
|
||||
mysql> select * from mysql_table;
|
||||
+--------+-------+
|
||||
| int_id | value |
|
||||
+--------+-------+
|
||||
| 1 | 2 |
|
||||
+--------+-------+
|
||||
1 row in set (0,00 sec)
|
||||
```
|
||||
|
||||
在ClickHouse中创建MySQL类型的数据库,同时与MySQL服务器交换数据:
|
||||
|
||||
```sql
|
||||
CREATE DATABASE mysql_db ENGINE = MySQL('localhost:3306', 'test', 'my_user', 'user_password')
|
||||
```
|
||||
```sql
|
||||
SHOW DATABASES
|
||||
```
|
||||
```text
|
||||
┌─name─────┐
|
||||
│ default │
|
||||
│ mysql_db │
|
||||
│ system │
|
||||
└──────────┘
|
||||
```
|
||||
```sql
|
||||
SHOW TABLES FROM mysql_db
|
||||
```
|
||||
```text
|
||||
┌─name─────────┐
|
||||
│ mysql_table │
|
||||
└──────────────┘
|
||||
```
|
||||
```sql
|
||||
SELECT * FROM mysql_db.mysql_table
|
||||
```
|
||||
```text
|
||||
┌─int_id─┬─value─┐
|
||||
│ 1 │ 2 │
|
||||
└────────┴───────┘
|
||||
```
|
||||
```sql
|
||||
INSERT INTO mysql_db.mysql_table VALUES (3,4)
|
||||
```
|
||||
```sql
|
||||
SELECT * FROM mysql_db.mysql_table
|
||||
```
|
||||
```text
|
||||
┌─int_id─┬─value─┐
|
||||
│ 1 │ 2 │
|
||||
│ 3 │ 4 │
|
||||
└────────┴───────┘
|
||||
```
|
||||
|
||||
[来源文章](https://clickhouse.yandex/docs/en/database_engines/mysql/) <!--hide-->
|
@ -17,7 +17,7 @@
|
||||
* but with the following differences:
|
||||
* - works with any memory ranges, including containing zero bytes;
|
||||
* - doesn't require terminating zero byte: end of memory range is passed explicitly;
|
||||
* - if not found, returns pointer to end instead of NULL;
|
||||
* - if not found, returns pointer to end instead of nullptr;
|
||||
* - maximum number of symbols to search is 16.
|
||||
*
|
||||
* Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols,
|
||||
@ -65,115 +65,154 @@ inline __m128i mm_is_in(__m128i bytes)
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template <char... symbols>
|
||||
inline const char * find_first_symbols_sse2(const char * begin, const char * end)
|
||||
template <bool positive>
|
||||
bool maybe_negate(bool x)
|
||||
{
|
||||
if constexpr (positive)
|
||||
return x;
|
||||
else
|
||||
return !x;
|
||||
}
|
||||
|
||||
template <bool positive>
|
||||
uint16_t maybe_negate(uint16_t x)
|
||||
{
|
||||
if constexpr (positive)
|
||||
return x;
|
||||
else
|
||||
return ~x;
|
||||
}
|
||||
|
||||
enum class ReturnMode
|
||||
{
|
||||
End,
|
||||
Nullptr,
|
||||
};
|
||||
|
||||
|
||||
template <bool positive, ReturnMode return_mode, char... symbols>
|
||||
inline const char * find_first_symbols_sse2(const char * const begin, const char * const end)
|
||||
{
|
||||
const char * pos = begin;
|
||||
|
||||
#if defined(__SSE2__)
|
||||
for (; begin + 15 < end; begin += 16)
|
||||
for (; pos + 15 < end; pos += 16)
|
||||
{
|
||||
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(begin));
|
||||
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
|
||||
|
||||
__m128i eq = mm_is_in<symbols...>(bytes);
|
||||
|
||||
uint16_t bit_mask = _mm_movemask_epi8(eq);
|
||||
uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
|
||||
if (bit_mask)
|
||||
return begin + __builtin_ctz(bit_mask);
|
||||
return pos + __builtin_ctz(bit_mask);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; begin < end; ++begin)
|
||||
if (is_in<symbols...>(*begin))
|
||||
return begin;
|
||||
return end;
|
||||
for (; pos < end; ++pos)
|
||||
if (maybe_negate<positive>(is_in<symbols...>(*pos)))
|
||||
return pos;
|
||||
|
||||
return return_mode == ReturnMode::End ? end : nullptr;
|
||||
}
|
||||
|
||||
|
||||
template <char... symbols>
|
||||
inline const char * find_last_symbols_or_null_sse2(const char * begin, const char * end)
|
||||
template <bool positive, ReturnMode return_mode, char... symbols>
|
||||
inline const char * find_last_symbols_sse2(const char * const begin, const char * const end)
|
||||
{
|
||||
const char * pos = end;
|
||||
|
||||
#if defined(__SSE2__)
|
||||
for (; end - 16 >= begin; end -= 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers.
|
||||
for (; pos - 16 >= begin; pos -= 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers.
|
||||
{
|
||||
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(end - 16));
|
||||
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos - 16));
|
||||
|
||||
__m128i eq = mm_is_in<symbols...>(bytes);
|
||||
|
||||
uint16_t bit_mask = _mm_movemask_epi8(eq);
|
||||
uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
|
||||
if (bit_mask)
|
||||
return end - 1 - (__builtin_clz(bit_mask) - 16); /// because __builtin_clz works with mask as uint32.
|
||||
return pos - 1 - (__builtin_clz(bit_mask) - 16); /// because __builtin_clz works with mask as uint32.
|
||||
}
|
||||
#endif
|
||||
|
||||
--end;
|
||||
for (; end >= begin; --end)
|
||||
if (is_in<symbols...>(*end))
|
||||
return end;
|
||||
--pos;
|
||||
for (; pos >= begin; --pos)
|
||||
if (maybe_negate<positive>(is_in<symbols...>(*pos)))
|
||||
return pos;
|
||||
|
||||
return nullptr;
|
||||
return return_mode == ReturnMode::End ? end : nullptr;
|
||||
}
|
||||
|
||||
|
||||
template <size_t num_chars,
|
||||
template <bool positive, ReturnMode return_mode, size_t num_chars,
|
||||
char c01, char c02 = 0, char c03 = 0, char c04 = 0,
|
||||
char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0,
|
||||
char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0,
|
||||
char c13 = 0, char c14 = 0, char c15 = 0, char c16 = 0>
|
||||
inline const char * find_first_symbols_sse42_impl(const char * begin, const char * end)
|
||||
inline const char * find_first_symbols_sse42_impl(const char * const begin, const char * const end)
|
||||
{
|
||||
const char * pos = begin;
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT)
|
||||
__m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, c16);
|
||||
|
||||
for (; begin + 15 < end; begin += 16)
|
||||
for (; pos + 15 < end; pos += 16)
|
||||
{
|
||||
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(begin));
|
||||
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
|
||||
|
||||
if (_mm_cmpestrc(set, num_chars, bytes, 16, MODE))
|
||||
return begin + _mm_cmpestri(set, num_chars, bytes, 16, MODE);
|
||||
if constexpr (positive)
|
||||
{
|
||||
if (_mm_cmpestrc(set, num_chars, bytes, 16, MODE))
|
||||
return pos + _mm_cmpestri(set, num_chars, bytes, 16, MODE);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (_mm_cmpestrc(set, num_chars, bytes, 16, MODE | _SIDD_NEGATIVE_POLARITY))
|
||||
return pos + _mm_cmpestri(set, num_chars, bytes, 16, MODE | _SIDD_NEGATIVE_POLARITY);
|
||||
}
|
||||
}
|
||||
#undef MODE
|
||||
#endif
|
||||
|
||||
for (; begin < end; ++begin)
|
||||
if ( (num_chars >= 1 && *begin == c01)
|
||||
|| (num_chars >= 2 && *begin == c02)
|
||||
|| (num_chars >= 3 && *begin == c03)
|
||||
|| (num_chars >= 4 && *begin == c04)
|
||||
|| (num_chars >= 5 && *begin == c05)
|
||||
|| (num_chars >= 6 && *begin == c06)
|
||||
|| (num_chars >= 7 && *begin == c07)
|
||||
|| (num_chars >= 8 && *begin == c08)
|
||||
|| (num_chars >= 9 && *begin == c09)
|
||||
|| (num_chars >= 10 && *begin == c10)
|
||||
|| (num_chars >= 11 && *begin == c11)
|
||||
|| (num_chars >= 12 && *begin == c12)
|
||||
|| (num_chars >= 13 && *begin == c13)
|
||||
|| (num_chars >= 14 && *begin == c14)
|
||||
|| (num_chars >= 15 && *begin == c15)
|
||||
|| (num_chars >= 16 && *begin == c16))
|
||||
return begin;
|
||||
return end;
|
||||
for (; pos < end; ++pos)
|
||||
if ( (num_chars >= 1 && maybe_negate<positive>(*pos == c01))
|
||||
|| (num_chars >= 2 && maybe_negate<positive>(*pos == c02))
|
||||
|| (num_chars >= 3 && maybe_negate<positive>(*pos == c03))
|
||||
|| (num_chars >= 4 && maybe_negate<positive>(*pos == c04))
|
||||
|| (num_chars >= 5 && maybe_negate<positive>(*pos == c05))
|
||||
|| (num_chars >= 6 && maybe_negate<positive>(*pos == c06))
|
||||
|| (num_chars >= 7 && maybe_negate<positive>(*pos == c07))
|
||||
|| (num_chars >= 8 && maybe_negate<positive>(*pos == c08))
|
||||
|| (num_chars >= 9 && maybe_negate<positive>(*pos == c09))
|
||||
|| (num_chars >= 10 && maybe_negate<positive>(*pos == c10))
|
||||
|| (num_chars >= 11 && maybe_negate<positive>(*pos == c11))
|
||||
|| (num_chars >= 12 && maybe_negate<positive>(*pos == c12))
|
||||
|| (num_chars >= 13 && maybe_negate<positive>(*pos == c13))
|
||||
|| (num_chars >= 14 && maybe_negate<positive>(*pos == c14))
|
||||
|| (num_chars >= 15 && maybe_negate<positive>(*pos == c15))
|
||||
|| (num_chars >= 16 && maybe_negate<positive>(*pos == c16)))
|
||||
return pos;
|
||||
return return_mode == ReturnMode::End ? end : nullptr;
|
||||
}
|
||||
|
||||
|
||||
template <char... symbols>
|
||||
template <bool positive, ReturnMode return_mode, char... symbols>
|
||||
inline const char * find_first_symbols_sse42(const char * begin, const char * end)
|
||||
{
|
||||
return find_first_symbols_sse42_impl<sizeof...(symbols), symbols...>(begin, end);
|
||||
return find_first_symbols_sse42_impl<positive, return_mode, sizeof...(symbols), symbols...>(begin, end);
|
||||
}
|
||||
|
||||
/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth doing.
|
||||
|
||||
template <char... symbols>
|
||||
template <bool positive, ReturnMode return_mode, char... symbols>
|
||||
inline const char * find_first_symbols_dispatch(const char * begin, const char * end)
|
||||
{
|
||||
#if defined(__SSE4_2__)
|
||||
if (sizeof...(symbols) >= 5)
|
||||
return find_first_symbols_sse42<symbols...>(begin, end);
|
||||
return find_first_symbols_sse42<positive, return_mode, symbols...>(begin, end);
|
||||
else
|
||||
#endif
|
||||
return find_first_symbols_sse2<symbols...>(begin, end);
|
||||
return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
|
||||
}
|
||||
|
||||
}
|
||||
@ -182,7 +221,7 @@ inline const char * find_first_symbols_dispatch(const char * begin, const char *
|
||||
template <char... symbols>
|
||||
inline const char * find_first_symbols(const char * begin, const char * end)
|
||||
{
|
||||
return detail::find_first_symbols_dispatch<symbols...>(begin, end);
|
||||
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, end);
|
||||
}
|
||||
|
||||
/// Returning non const result for non const arguments.
|
||||
@ -190,18 +229,66 @@ inline const char * find_first_symbols(const char * begin, const char * end)
|
||||
template <char... symbols>
|
||||
inline char * find_first_symbols(char * begin, char * end)
|
||||
{
|
||||
return const_cast<char *>(detail::find_first_symbols_dispatch<symbols...>(begin, end));
|
||||
return const_cast<char *>(detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, end));
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline const char * find_first_not_symbols(const char * begin, const char * end)
|
||||
{
|
||||
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, end);
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline char * find_first_not_symbols(char * begin, char * end)
|
||||
{
|
||||
return const_cast<char *>(detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, end));
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline const char * find_first_symbols_or_null(const char * begin, const char * end)
|
||||
{
|
||||
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin, end);
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline char * find_first_symbols_or_null(char * begin, char * end)
|
||||
{
|
||||
return const_cast<char *>(detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin, end));
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline const char * find_first_not_symbols_or_null(const char * begin, const char * end)
|
||||
{
|
||||
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>(begin, end);
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline char * find_first_not_symbols_or_null(char * begin, char * end)
|
||||
{
|
||||
return const_cast<char *>(detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>(begin, end));
|
||||
}
|
||||
|
||||
|
||||
template <char... symbols>
|
||||
inline const char * find_last_symbols_or_null(const char * begin, const char * end)
|
||||
{
|
||||
return detail::find_last_symbols_or_null_sse2<symbols...>(begin, end);
|
||||
return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, end);
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline char * find_last_symbols_or_null(char * begin, char * end)
|
||||
{
|
||||
return const_cast<char *>(detail::find_last_symbols_or_null_sse2<symbols...>(begin, end));
|
||||
return const_cast<char *>(detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, end));
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline const char * find_last_not_symbols_or_null(const char * begin, const char * end)
|
||||
{
|
||||
return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, end);
|
||||
}
|
||||
|
||||
template <char... symbols>
|
||||
inline char * find_last_not_symbols_or_null(char * begin, char * end)
|
||||
{
|
||||
return const_cast<char *>(detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, end));
|
||||
}
|
||||
|
@ -25,3 +25,9 @@ find $ROOT_PATH/dbms -name '*.h' -or -name '*.cpp' |
|
||||
find $ROOT_PATH/dbms -name '*.h' -or -name '*.cpp' |
|
||||
grep -vP 'Compiler|build' |
|
||||
xargs grep $@ -P '}\s*//+\s*namespace\s*'
|
||||
|
||||
# Broken symlinks
|
||||
find -L $ROOT_PATH -type l | grep -v contrib && echo "^ Broken symlinks found"
|
||||
|
||||
# Double whitespaces
|
||||
find $ROOT_PATH/dbms -name '*.h' -or -name '*.cpp' | while read i; do $ROOT_PATH/utils/check-style/double-whitespaces.pl < $i || echo -e "^ File $i contains double whitespaces\n"; done
|
||||
|
33
utils/check-style/double-whitespaces.pl
Executable file
33
utils/check-style/double-whitespaces.pl
Executable file
@ -0,0 +1,33 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use strict;
|
||||
|
||||
# Find double whitespace such as "a,  b, c" that looks very ugly and annoying.
|
||||
# But skip double whitespaces if they are used as an alignment - by comparing to surrounding lines.
|
||||
|
||||
my @array;
|
||||
|
||||
while (<>)
|
||||
{
|
||||
push @array, $_;
|
||||
}
|
||||
|
||||
my $ret = 0;
|
||||
|
||||
for (my $i = 1; $i < $#array; ++$i)
|
||||
{
|
||||
if ($array[$i] =~ ',( {2,3})[^ /]')
|
||||
{
|
||||
# https://stackoverflow.com/questions/87380/how-can-i-find-the-location-of-a-regex-match-in-perl
|
||||
|
||||
if ((substr($array[$i - 1], $+[1] - 1, 2) !~ /^[ -][^ ]$/) # whitespaces are not part of alignment
|
||||
&& (substr($array[$i + 1], $+[1] - 1, 2) !~ /^[ -][^ ]$/)
|
||||
&& $array[$i] !~ /(-?\d+\w*,\s+){3,}/) # this is not a number table like { 10, -1, 2 }
|
||||
{
|
||||
print(($i + 1) . ":" . $array[$i]);
|
||||
$ret = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exit $ret;
|
Loading…
Reference in New Issue
Block a user