Merge branch 'master' into kafka_zstd

This commit is contained in:
Odin Hultgren Van Der Horst 2019-01-18 08:19:30 +01:00
commit 0c4ace4f8f
105 changed files with 26991 additions and 1096 deletions

View File

@ -1,7 +1,7 @@
# Use Ninja instead of Unix Makefiles by default.
# https://stackoverflow.com/questions/11269833/cmake-selecting-a-generator-within-cmakelists-txt
#
# Reason: it have better startup time than make and it parallelize jobs more uniformly.
# Reason: it has better startup time than make and it parallelizes jobs more uniformly.
# (when compared to make with Makefiles that were generated by CMake)
#
# How to install Ninja on Ubuntu:

View File

@ -2,10 +2,10 @@
set(VERSION_REVISION 54413)
set(VERSION_MAJOR 19)
set(VERSION_MINOR 1)
set(VERSION_PATCH 0)
set(VERSION_GITHASH 014e344a36bc19a58621e0add379984cf62b9067)
set(VERSION_DESCRIBE v19.1.0-testing)
set(VERSION_STRING 19.1.0)
set(VERSION_PATCH 1)
set(VERSION_GITHASH 4e7747117123f5a1b027a64865844b4faa10447d)
set(VERSION_DESCRIBE v19.1.1-testing)
set(VERSION_STRING 19.1.1)
# end of autochange
set(VERSION_EXTRA "" CACHE STRING "")

View File

@ -235,6 +235,11 @@ private:
actions.clear();
actions.emplace_back(PatternActionType::KleeneStar);
dfa_states.clear();
dfa_states.emplace_back(true);
pattern_has_time = false;
const char * pos = pattern.data();
const char * begin = pos;
const char * end = pos + pattern.size();
@ -285,6 +290,7 @@ private:
actions.back().type != PatternActionType::KleeneStar)
throw Exception{"Temporal condition should be preceeded by an event condition", ErrorCodes::BAD_ARGUMENTS};
pattern_has_time = true;
actions.emplace_back(type, duration);
}
else
@ -299,6 +305,9 @@ private:
throw Exception{"Event number " + toString(event_number) + " is out of range", ErrorCodes::BAD_ARGUMENTS};
actions.emplace_back(PatternActionType::SpecificEvent, event_number - 1);
dfa_states.back().transition = DFATransition::SpecificEvent;
dfa_states.back().event = event_number - 1;
dfa_states.emplace_back();
}
if (!match(")"))
@ -306,17 +315,88 @@ private:
}
else if (match(".*"))
{
actions.emplace_back(PatternActionType::KleeneStar);
dfa_states.back().has_kleene = true;
}
else if (match("."))
{
actions.emplace_back(PatternActionType::AnyEvent);
dfa_states.back().transition = DFATransition::AnyEvent;
dfa_states.emplace_back();
}
else
throw_exception("Could not parse pattern, unexpected starting symbol");
}
}
protected:
/// Uses a DFA-based approach in order to better handle patterns without
/// time assertions.
///
/// NOTE: This implementation relies on the assumption that the patterns are *small*.
///
/// This algorithm runs in O(mn) (with m the number of DFA states and n the number
/// of events), with memory consumption and memory allocations in O(m). This means that
/// if n >> m (which is expected to be the case), the algorithm can be considered linear.
template <typename T>
bool match(T & events_it, const T events_end) const
bool dfaMatch(T & events_it, const T events_end) const
{
using ActiveStates = std::vector<bool>;
/// These two vectors track which states are active for the current event
/// and which states will be active for the next event.
ActiveStates active_states(dfa_states.size(), false);
ActiveStates next_active_states(dfa_states.size(), false);
active_states[0] = true;
/// Tracks how many states are still active, so that we can stop early
/// instead of iterating over all the remaining events once the match has already failed.
size_t n_active = 1;
for (/* empty */; events_it != events_end && n_active > 0 && !active_states.back(); ++events_it)
{
n_active = 0;
next_active_states.assign(dfa_states.size(), false);
for (size_t state = 0; state < dfa_states.size(); ++state)
{
if (!active_states[state])
{
continue;
}
switch (dfa_states[state].transition)
{
case DFATransition::None:
break;
case DFATransition::AnyEvent:
next_active_states[state + 1] = true;
++n_active;
break;
case DFATransition::SpecificEvent:
if (events_it->second.test(dfa_states[state].event))
{
next_active_states[state + 1] = true;
++n_active;
}
break;
}
if (dfa_states[state].has_kleene)
{
next_active_states[state] = true;
++n_active;
}
}
swap(active_states, next_active_states);
}
return active_states.back();
}
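To make the two-vector bookkeeping above concrete, here is a minimal standalone sketch of the same DFA simulation. It is a simplified illustration, not the class in this diff: Transition/State mirror the DFATransition/DFAState members declared further down, and events are reduced to plain integers instead of condition bitsets.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>
enum class Transition : char { None, SpecificEvent, AnyEvent };
struct State
{
    bool has_kleene = false;
    uint32_t event = 0;                          /// used only for SpecificEvent
    Transition transition = Transition::None;
};
/// True if `events` matches the linear DFA: state i transitions to state i + 1,
/// and a state with has_kleene also keeps itself active (self-loop).
bool dfaMatch(const std::vector<State> & states, const std::vector<uint32_t> & events)
{
    std::vector<bool> active(states.size(), false);
    std::vector<bool> next(states.size(), false);
    active[0] = true;
    size_t n_active = 1;
    for (auto it = events.begin(); it != events.end() && n_active > 0 && !active.back(); ++it)
    {
        n_active = 0;
        next.assign(states.size(), false);
        for (size_t s = 0; s < states.size(); ++s)
        {
            if (!active[s])
                continue;
            if (states[s].transition == Transition::AnyEvent
                || (states[s].transition == Transition::SpecificEvent && *it == states[s].event))
            {
                next[s + 1] = true;     /// the last state has transition None, so s + 1 stays in range
                ++n_active;
            }
            if (states[s].has_kleene)
            {
                next[s] = true;
                ++n_active;
            }
        }
        std::swap(active, next);
    }
    return active.back();
}
int main()
{
    /// Roughly the pattern "(?1).*(?2)": event 0, then anything, then event 1.
    std::vector<State> states =
    {
        {true, 0, Transition::SpecificEvent},
        {true, 1, Transition::SpecificEvent},
        {false, 0, Transition::None},            /// accepting state
    };
    std::cout << dfaMatch(states, {3, 0, 5, 1}) << '\n';   /// 1
    std::cout << dfaMatch(states, {1, 0}) << '\n';         /// 0
}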
template <typename T>
bool backtrackingMatch(T & events_it, const T events_end) const
{
const auto action_begin = std::begin(actions);
const auto action_end = std::end(actions);
@ -445,10 +525,53 @@ protected:
return action_it == action_end;
}
private:
enum class DFATransition : char
{
/// .-------.
/// | |
/// `-------'
None,
/// .-------. (?[0-9])
/// | | ----------
/// `-------'
SpecificEvent,
/// .-------. .
/// | | ----------
/// `-------'
AnyEvent,
};
struct DFAState
{
DFAState(bool has_kleene = false)
: has_kleene{has_kleene}, event{0}, transition{DFATransition::None}
{}
/// .-------.
/// | | - - -
/// `-------'
/// |_^
bool has_kleene;
/// In the case of a state transition on a `SpecificEvent`,
/// `event` contains the value of the event.
uint32_t event;
/// The kind of transition out of this state.
DFATransition transition;
};
using DFAStates = std::vector<DFAState>;
protected:
/// `true` if the parsed pattern contains time assertions (?t...), `false` otherwise.
bool pattern_has_time;
private:
std::string pattern;
size_t arg_count;
PatternActions actions;
DFAStates dfa_states;
};
@ -471,7 +594,8 @@ public:
const auto events_end = std::end(data_ref.events_list);
auto events_it = events_begin;
static_cast<ColumnUInt8 &>(to).getData().push_back(match(events_it, events_end));
bool match = pattern_has_time ? backtrackingMatch(events_it, events_end) : dfaMatch(events_it, events_end);
static_cast<ColumnUInt8 &>(to).getData().push_back(match);
}
};
@ -501,7 +625,7 @@ private:
auto events_it = events_begin;
size_t count = 0;
while (events_it != events_end && match(events_it, events_end))
while (events_it != events_end && backtrackingMatch(events_it, events_end))
++count;
return count;

View File

@ -1,15 +1,17 @@
#pragma once
#include <Common/StringSearcher.h>
#include <Common/StringUtils/StringUtils.h>
#include <algorithm>
#include <vector>
#include <stdint.h>
#include <string.h>
#include <Columns/ColumnString.h>
#include <Core/Types.h>
#include <Poco/UTF8Encoding.h>
#include <Poco/Unicode.h>
#include <Common/StringSearcher.h>
#include <Common/StringUtils/StringUtils.h>
#include <common/StringRef.h>
#include <common/unaligned.h>
#include <ext/range.h>
#include <stdint.h>
#include <string.h>
/** Search for a substring in a string by Volnitsky's algorithm
* http://volnitsky.com/project/str_search/
@ -28,117 +30,38 @@
* - if it did not match, we check the next cell of the hash table from the collision-resolution chain;
* - if nothing matched, we skip ahead in the haystack by almost the size of the needle;
*
* Unaligned memory access is used.
* MultiVolnitsky - search for multiple substrings in a string:
* - Bigrams are added to the hash table together with the index of the needle; then the usual Volnitsky search is used.
* - Needles are added lazily while searching, limiting the number of fallback searchers and the total number of added bigrams
*/
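As a rough standalone illustration of the scheme just described (not the classes below), the following sketch builds the bigram hash table for one needle and scans the haystack in needle-sized steps, verifying each table hit bytewise with memcmp. The real code below adds a fallback searcher for short needles, case-insensitive and UTF-8 variants, and multi-needle support.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>
/// Minimal Volnitsky-style search (ASCII, case-sensitive).
/// Assumes 2 <= needle_size < 256 and haystack_size >= needle_size.
const char * volnitskySearch(const char * haystack, size_t haystack_size,
                             const char * needle, size_t needle_size)
{
    constexpr size_t hash_size = 64 * 1024;
    std::vector<uint8_t> hash(hash_size, 0);     /// stores offset + 1; 0 means an empty cell
    auto to_ngram = [](const char * p)
    {
        uint16_t n;
        std::memcpy(&n, p, sizeof(n));           /// unaligned-safe 2-byte load
        return n;
    };
    /// Put every bigram of the needle into the table (open addressing, linear probing).
    for (size_t i = 0; i + sizeof(uint16_t) <= needle_size; ++i)
    {
        size_t cell = to_ngram(needle + i) % hash_size;
        while (hash[cell])
            cell = (cell + 1) % hash_size;
        hash[cell] = static_cast<uint8_t>(i + 1);
    }
    const size_t step = needle_size - sizeof(uint16_t) + 1;
    const char * haystack_end = haystack + haystack_size;
    /// "Apply" the needle to the haystack and probe the bigram under its end.
    for (const char * pos = haystack + needle_size - sizeof(uint16_t); pos + sizeof(uint16_t) <= haystack_end; pos += step)
    {
        for (size_t cell = to_ngram(pos) % hash_size; hash[cell]; cell = (cell + 1) % hash_size)
        {
            const char * candidate = pos - (hash[cell] - 1);
            if (candidate + needle_size <= haystack_end && std::memcmp(candidate, needle, needle_size) == 0)
                return candidate;
        }
    }
    return haystack_end;                         /// not found
}
int main()
{
    const char * text = "search for a substring in a string";
    const char * res = volnitskySearch(text, std::strlen(text), "substring", 9);
    std::cout << (res - text) << '\n';           /// 13
}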
namespace DB
{
/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack
template <typename CRTP>
class VolnitskyBase
namespace VolnitskyTraits
{
protected:
using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
using Ngram = UInt16; /// n-gram (2 bytes).
using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
using Id = UInt8; /// Index of the string (within the array of multiple needles), must not be greater than 255.
using Ngram = UInt16; /// n-gram (2 bytes).
const UInt8 * const needle;
const size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
/// For how long we move, if the n-gram from haystack is not found in the hash table.
const size_t step = needle_size - sizeof(Ngram) + 1;
/** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache (of common Intel CPUs).
Offset hash[hash_size]; /// Hash table.
/** Fits into the L2 cache (of common Intel CPUs).
* This number is also convenient for compilers: it is one more than std::numeric_limits<UInt16>::max(),
* so operations on 2-byte n-grams can use instructions such as movzwl.
*/
static constexpr size_t hash_size = 64 * 1024;
/// min haystack size to use main algorithm instead of fallback
static constexpr auto min_haystack_size_for_algorithm = 20000;
const bool fallback; /// Do we need to use the fallback algorithm.
static constexpr size_t min_haystack_size_for_algorithm = 20000;
public:
/** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
* If you specify it small enough, the fallback algorithm will be used,
* since it is considered that it's useless to waste time initializing the hash table.
*/
VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
: needle{reinterpret_cast<const UInt8 *>(needle)}, needle_size{needle_size},
fallback{
needle_size < 2 * sizeof(Ngram)
|| needle_size >= std::numeric_limits<Offset>::max()
|| (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)}
static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0)
{
if (fallback)
return;
memset(hash, 0, sizeof(hash));
/// int is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
for (auto i = static_cast<int>(needle_size - sizeof(Ngram)); i >= 0; --i)
self().putNGram(this->needle + i, i + 1, this->needle);
return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max()
|| (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm);
}
static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad<Ngram>(pos); }
/// If not found, the end of the haystack is returned.
const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const
{
if (needle_size == 0)
return haystack;
const auto haystack_end = haystack + haystack_size;
if (needle_size == 1 || fallback || haystack_size <= needle_size)
return self().search_fallback(haystack, haystack_end);
/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
const auto * pos = haystack + needle_size - sizeof(Ngram);
for (; pos <= haystack_end - needle_size; pos += step)
{
/// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
for (size_t cell_num = toNGram(pos) % hash_size; hash[cell_num];
cell_num = (cell_num + 1) % hash_size)
{
/// When found - compare bytewise, using the offset from the hash table.
const auto res = pos - (hash[cell_num] - 1);
if (self().compare(res))
return res;
}
}
/// The remaining tail.
return self().search_fallback(pos - step + 1, haystack_end);
}
const char * search(const char * haystack, size_t haystack_size) const
{
return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size));
}
protected:
CRTP & self() { return static_cast<CRTP &>(*this); }
const CRTP & self() const { return const_cast<VolnitskyBase *>(this)->self(); }
static Ngram toNGram(const UInt8 * const pos)
{
return unalignedLoad<Ngram>(pos);
}
void putNGramBase(const Ngram ngram, const int offset)
{
/// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
size_t cell_num = ngram % hash_size;
while (hash[cell_num])
cell_num = (cell_num + 1) % hash_size; /// Search for the next free cell.
hash[cell_num] = offset;
}
void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset)
template <typename Callback>
static inline void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset, const Callback & putNGramBase)
{
struct Chars
{
@ -186,274 +109,583 @@ protected:
/// 1 combination: 01
putNGramBase(n, offset);
}
};
template <bool CaseSensitive, bool ASCII> struct VolnitskyImpl;
/// Case sensitive comparison
template <bool ASCII> struct VolnitskyImpl<true, ASCII> : VolnitskyBase<VolnitskyImpl<true, ASCII>>
{
VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
: VolnitskyBase<VolnitskyImpl<true, ASCII>>{needle_, needle_size_, haystack_size_hint},
fallback_searcher{needle_, needle_size_}
template <bool CaseSensitive, bool ASCII, typename Callback>
static inline void putNGram(const UInt8 * const pos, const int offset, [[maybe_unused]] const UInt8 * const begin, const Callback & putNGramBase)
{
}
void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const /*begin*/)
{
this->putNGramBase(this->toNGram(pos), offset);
}
bool compare(const UInt8 * const pos) const
{
/// @todo: maybe just use memcmp for this case and rely on internal SSE optimization as in case with memcpy?
return fallback_searcher.compare(pos);
}
const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
{
return fallback_searcher.search(haystack, haystack_end);
}
ASCIICaseSensitiveStringSearcher fallback_searcher;
};
/// Case-insensitive ASCII
template <> struct VolnitskyImpl<false, true> : VolnitskyBase<VolnitskyImpl<false, true>>
{
VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
: VolnitskyBase{needle_, needle_size_, haystack_size_hint}, fallback_searcher{needle_, needle_size_}
{
}
void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const /*begin*/)
{
putNGramASCIICaseInsensitive(pos, offset);
}
bool compare(const UInt8 * const pos) const
{
return fallback_searcher.compare(pos);
}
const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
{
return fallback_searcher.search(haystack, haystack_end);
}
ASCIICaseInsensitiveStringSearcher fallback_searcher;
};
/// Case-sensitive UTF-8
template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<false, false>>
{
VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
: VolnitskyBase{needle_, needle_size_, haystack_size_hint}, fallback_searcher{needle_, needle_size_}
{
}
void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const begin)
{
struct Chars
if constexpr (CaseSensitive)
{
UInt8 c0;
UInt8 c1;
};
union
{
Ngram n;
Chars chars;
};
n = toNGram(pos);
if (isascii(chars.c0) && isascii(chars.c1))
{
putNGramASCIICaseInsensitive(pos, offset);
putNGramBase(toNGram(pos), offset);
}
else
{
/** n-gram (in the case of n = 2)
* can be entirely located within one code point,
* or intersect with two code points.
*
* In the first case, you need to consider up to two alternatives - this code point in upper and lower case,
* and in the second case - up to four alternatives - fragments of two code points in all combinations of cases.
*
* It does not take into account the dependence of the case-transformation from the locale (for example - Turkish `Ii`)
* as well as composition / decomposition and other features.
*
* It also does not work if characters with lower and upper cases are represented by different number of bytes or code points.
*/
using Seq = UInt8[6];
static const Poco::UTF8Encoding utf8;
if (UTF8::isContinuationOctet(chars.c1))
if constexpr (ASCII)
{
/// ngram is inside a sequence
auto seq_pos = pos;
UTF8::syncBackward(seq_pos, begin);
const auto u32 = utf8.convert(seq_pos);
const auto l_u32 = Poco::Unicode::toLower(u32);
const auto u_u32 = Poco::Unicode::toUpper(u32);
/// symbol is case-independent
if (l_u32 == u_u32)
putNGramBase(n, offset);
else
{
/// where is the given ngram in respect to the start of UTF-8 sequence?
const auto seq_ngram_offset = pos - seq_pos;
Seq seq;
/// put ngram for lowercase
utf8.convert(l_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
chars.c1 = seq[seq_ngram_offset + 1];
putNGramBase(n, offset);
/// put ngram for uppercase
utf8.convert(u_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
chars.c1 = seq[seq_ngram_offset + 1];
putNGramBase(n, offset);
}
putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
}
else
{
/// ngram is on the boundary of two sequences
/// first sequence may start before u_pos if it is not ASCII
auto first_seq_pos = pos;
UTF8::syncBackward(first_seq_pos, begin);
/// where is the given ngram in respect to the start of first UTF-8 sequence?
const auto seq_ngram_offset = pos - first_seq_pos;
const auto first_u32 = utf8.convert(first_seq_pos);
const auto first_l_u32 = Poco::Unicode::toLower(first_u32);
const auto first_u_u32 = Poco::Unicode::toUpper(first_u32);
/// second sequence always start immediately after u_pos
auto second_seq_pos = pos + 1;
const auto second_u32 = utf8.convert(second_seq_pos); /// TODO This assumes valid UTF-8 or zero byte after needle.
const auto second_l_u32 = Poco::Unicode::toLower(second_u32);
const auto second_u_u32 = Poco::Unicode::toUpper(second_u32);
/// both symbols are case-independent
if (first_l_u32 == first_u_u32 && second_l_u32 == second_u_u32)
struct Chars
{
putNGramBase(n, offset);
}
else if (first_l_u32 == first_u_u32)
UInt8 c0;
UInt8 c1;
};
union
{
/// first symbol is case-independent
Seq seq;
VolnitskyTraits::Ngram n;
Chars chars;
};
/// put ngram for lowercase
utf8.convert(second_l_u32, seq, sizeof(seq));
chars.c1 = seq[0];
putNGramBase(n, offset);
n = toNGram(pos);
/// put ngram from uppercase, if it is different
utf8.convert(second_u_u32, seq, sizeof(seq));
if (chars.c1 != seq[0])
{
chars.c1 = seq[0];
putNGramBase(n, offset);
}
}
else if (second_l_u32 == second_u_u32)
{
/// second symbol is case-independent
Seq seq;
/// put ngram for lowercase
utf8.convert(first_l_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
putNGramBase(n, offset);
/// put ngram for uppercase, if it is different
utf8.convert(first_u_u32, seq, sizeof(seq));
if (chars.c0 != seq[seq_ngram_offset])
{
chars.c0 = seq[seq_ngram_offset];
putNGramBase(n, offset);
}
}
if (isascii(chars.c0) && isascii(chars.c1))
putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
else
{
Seq first_l_seq;
Seq first_u_seq;
Seq second_l_seq;
Seq second_u_seq;
/** n-gram (in the case of n = 2)
* can be entirely located within one code point,
* or intersect with two code points.
*
* In the first case, you need to consider up to two alternatives - this code point in upper and lower case,
* and in the second case - up to four alternatives - fragments of two code points in all combinations of cases.
*
* It does not take into account the dependence of case transformations on the locale (for example, Turkish `Ii`),
* nor composition / decomposition and other features.
*
* It also does not work if lowercase and uppercase characters are represented by a different number of bytes or code points.
*/
utf8.convert(first_l_u32, first_l_seq, sizeof(first_l_seq));
utf8.convert(first_u_u32, first_u_seq, sizeof(first_u_seq));
utf8.convert(second_l_u32, second_l_seq, sizeof(second_l_seq));
utf8.convert(second_u_u32, second_u_seq, sizeof(second_u_seq));
using Seq = UInt8[6];
auto c0l = first_l_seq[seq_ngram_offset];
auto c0u = first_u_seq[seq_ngram_offset];
auto c1l = second_l_seq[0];
auto c1u = second_u_seq[0];
static const Poco::UTF8Encoding utf8;
/// ngram for ll
chars.c0 = c0l;
chars.c1 = c1l;
putNGramBase(n, offset);
if (c0l != c0u)
if (UTF8::isContinuationOctet(chars.c1))
{
/// ngram for Ul
chars.c0 = c0u;
chars.c1 = c1l;
putNGramBase(n, offset);
/// ngram is inside a sequence
auto seq_pos = pos;
UTF8::syncBackward(seq_pos, begin);
const auto u32 = utf8.convert(seq_pos);
const auto l_u32 = Poco::Unicode::toLower(u32);
const auto u_u32 = Poco::Unicode::toUpper(u32);
/// symbol is case-independent
if (l_u32 == u_u32)
putNGramBase(n, offset);
else
{
/// where is the given ngram in respect to the start of UTF-8 sequence?
const auto seq_ngram_offset = pos - seq_pos;
Seq seq;
/// put ngram for lowercase
utf8.convert(l_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
chars.c1 = seq[seq_ngram_offset + 1];
putNGramBase(n, offset);
/// put ngram for uppercase
utf8.convert(u_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
chars.c1 = seq[seq_ngram_offset + 1];
putNGramBase(n, offset);
}
}
if (c1l != c1u)
else
{
/// ngram for lU
chars.c0 = c0l;
chars.c1 = c1u;
putNGramBase(n, offset);
}
/// ngram is on the boundary of two sequences
/// first sequence may start before u_pos if it is not ASCII
auto first_seq_pos = pos;
UTF8::syncBackward(first_seq_pos, begin);
/// where is the given ngram in respect to the start of first UTF-8 sequence?
const auto seq_ngram_offset = pos - first_seq_pos;
if (c0l != c0u && c1l != c1u)
{
/// ngram for UU
chars.c0 = c0u;
chars.c1 = c1u;
putNGramBase(n, offset);
const auto first_u32 = utf8.convert(first_seq_pos);
const auto first_l_u32 = Poco::Unicode::toLower(first_u32);
const auto first_u_u32 = Poco::Unicode::toUpper(first_u32);
/// the second sequence always starts immediately after u_pos
auto second_seq_pos = pos + 1;
const auto second_u32 = utf8.convert(second_seq_pos); /// TODO This assumes valid UTF-8 or zero byte after needle.
const auto second_l_u32 = Poco::Unicode::toLower(second_u32);
const auto second_u_u32 = Poco::Unicode::toUpper(second_u32);
/// both symbols are case-independent
if (first_l_u32 == first_u_u32 && second_l_u32 == second_u_u32)
{
putNGramBase(n, offset);
}
else if (first_l_u32 == first_u_u32)
{
/// first symbol is case-independent
Seq seq;
/// put ngram for lowercase
utf8.convert(second_l_u32, seq, sizeof(seq));
chars.c1 = seq[0];
putNGramBase(n, offset);
/// put ngram from uppercase, if it is different
utf8.convert(second_u_u32, seq, sizeof(seq));
if (chars.c1 != seq[0])
{
chars.c1 = seq[0];
putNGramBase(n, offset);
}
}
else if (second_l_u32 == second_u_u32)
{
/// second symbol is case-independent
Seq seq;
/// put ngram for lowercase
utf8.convert(first_l_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
putNGramBase(n, offset);
/// put ngram for uppercase, if it is different
utf8.convert(first_u_u32, seq, sizeof(seq));
if (chars.c0 != seq[seq_ngram_offset])
{
chars.c0 = seq[seq_ngram_offset];
putNGramBase(n, offset);
}
}
else
{
Seq first_l_seq;
Seq first_u_seq;
Seq second_l_seq;
Seq second_u_seq;
utf8.convert(first_l_u32, first_l_seq, sizeof(first_l_seq));
utf8.convert(first_u_u32, first_u_seq, sizeof(first_u_seq));
utf8.convert(second_l_u32, second_l_seq, sizeof(second_l_seq));
utf8.convert(second_u_u32, second_u_seq, sizeof(second_u_seq));
auto c0l = first_l_seq[seq_ngram_offset];
auto c0u = first_u_seq[seq_ngram_offset];
auto c1l = second_l_seq[0];
auto c1u = second_u_seq[0];
/// ngram for ll
chars.c0 = c0l;
chars.c1 = c1l;
putNGramBase(n, offset);
if (c0l != c0u)
{
/// ngram for Ul
chars.c0 = c0u;
chars.c1 = c1l;
putNGramBase(n, offset);
}
if (c1l != c1u)
{
/// ngram for lU
chars.c0 = c0l;
chars.c1 = c1u;
putNGramBase(n, offset);
}
if (c0l != c0u && c1l != c1u)
{
/// ngram for UU
chars.c0 = c0u;
chars.c1 = c1u;
putNGramBase(n, offset);
}
}
}
}
}
}
}
}
bool compare(const UInt8 * const pos) const
/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack
template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
class VolnitskyBase
{
protected:
const UInt8 * const needle;
const size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
/// How far we move if the n-gram from the haystack is not found in the hash table.
const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
/** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
VolnitskyTraits::Offset hash[VolnitskyTraits::hash_size]; /// Hash table.
const bool fallback; /// Do we need to use the fallback algorithm.
FallbackSearcher fallback_searcher;
public:
/** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
* If you specify it small enough, the fallback algorithm will be used,
* since it is considered not worth the time to initialize the hash table.
*/
VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
: needle{reinterpret_cast<const UInt8 *>(needle)}
, needle_size{needle_size}
, fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
, fallback_searcher{needle, needle_size}
{
return fallback_searcher.compare(pos);
if (fallback)
return;
memset(hash, 0, sizeof(hash));
auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
/// ssize_t is used here because an unsigned type can't be used with a condition like `i >= 0` (unsigned is always >= 0)
for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i)
VolnitskyTraits::putNGram<CaseSensitive, ASCII>(this->needle + i, i + 1, this->needle, callback);
}
const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
/// If not found, the end of the haystack is returned.
const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const
{
return fallback_searcher.search(haystack, haystack_end);
if (needle_size == 0)
return haystack;
const auto haystack_end = haystack + haystack_size;
if (fallback || haystack_size <= needle_size)
return fallback_searcher.search(haystack, haystack_end);
/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
const auto * pos = haystack + needle_size - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - needle_size; pos += step)
{
/// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num];
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
/// When found - compare bytewise, using the offset from the hash table.
const auto res = pos - (hash[cell_num] - 1);
/// the pointer always refers into a padded array, so we can use page-safe semantics here
if (fallback_searcher.compare(res))
return res;
}
}
return fallback_searcher.search(pos - step + 1, haystack_end);
}
UTF8CaseInsensitiveStringSearcher fallback_searcher;
const char * search(const char * haystack, size_t haystack_size) const
{
return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size));
}
protected:
void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset)
{
/// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
size_t cell_num = ngram % VolnitskyTraits::hash_size;
while (hash[cell_num])
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size; /// Search for the next free cell.
hash[cell_num] = offset;
}
};
using Volnitsky = VolnitskyImpl<true, true>;
using VolnitskyUTF8 = VolnitskyImpl<true, false>; /// exactly same as Volnitsky
using VolnitskyCaseInsensitive = VolnitskyImpl<false, true>; /// ignores non-ASCII bytes
using VolnitskyCaseInsensitiveUTF8 = VolnitskyImpl<false, false>;
template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
class MultiVolnitskyBase
{
private:
/// needles and their offsets
const std::vector<StringRef> & needles;
/// fallback searchers
std::vector<size_t> fallback_needles;
std::vector<FallbackSearcher> fallback_searchers;
/// because std::pair<> is not POD
struct OffsetId
{
VolnitskyTraits::Id id;
VolnitskyTraits::Offset off;
};
OffsetId hash[VolnitskyTraits::hash_size];
/// step for each bunch of strings
size_t step;
/// index of the first needle that has not been processed yet
size_t last;
/// limit for adding to the hash table. In the worst case, with case-insensitive search, the table will be at most half full
static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8;
public:
MultiVolnitskyBase(const std::vector<StringRef> & needles_) : needles{needles_}, step{0}, last{0}
{
fallback_searchers.reserve(needles.size());
}
template <typename ResultType, typename AnsCallback>
void searchAll(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const AnsCallback & ansCallback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
const size_t needles_size = needles.size();
/// clear the result: entries that never match would otherwise stay uninitialized
std::fill(ans.begin(), ans.end(), 0);
while (!reset())
{
size_t fallback_size = fallback_needles.size();
size_t prev_offset = 0;
for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
for (size_t i = 0; i < fallback_size; ++i)
{
const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
if (ptr != haystack_end)
ans[from + fallback_needles[i]] = ansCallback(haystack, ptr);
}
/// check if we have at least one non-empty Volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto * res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (ans[from + ind] == 0 && res + needles[ind].size <= haystack_end)
{
if (fallback_searchers[ind].compare(res))
{
ans[from + ind] = ansCallback(haystack, res);
}
}
}
}
}
}
prev_offset = haystack_offsets[j];
}
}
}
template <typename ResultType>
void search(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
{
auto callback = [this](const UInt8 * haystack, const UInt8 * haystack_end) -> bool
{
return this->searchOne(haystack, haystack_end);
};
searchInternal(haystack_data, haystack_offsets, callback, ans);
}
template <typename ResultType>
void searchIndex(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
{
auto callback = [this](const UInt8 * haystack, const UInt8 * haystack_end) -> size_t
{
return this->searchOneIndex(haystack, haystack_end);
};
searchInternal(haystack_data, haystack_offsets, callback, ans);
}
private:
/**
* Initializes the hash table.
* Returns `true` if there is nothing left to initialize,
* and `false` otherwise, in which case it performs the initialization.
* This function is a kind of fallback for the case of many needles:
* we rebuild the hash table with the needles that have not been processed yet
* and search through the haystack again.
* The actual usage of this function is like this:
* while (!reset())
* {
* search inside the haystack with the known needles
* }
*/
bool reset()
{
if (last == needles.size())
return true;
memset(hash, 0, sizeof(hash));
fallback_needles.clear();
step = std::numeric_limits<size_t>::max();
size_t buf = 0;
size_t size = needles.size();
for (; last < size; ++last)
{
const char * cur_needle_data = needles[last].data;
const size_t cur_needle_size = needles[last].size;
/// save the indices of fallback searchers
if (VolnitskyTraits::isFallbackNeedle(cur_needle_size))
{
fallback_needles.push_back(last);
}
else
{
/// put all bigrams
auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset)
{
return this->putNGramBase(ngram, offset, this->last);
};
buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
/// this is the condition when we actually need to stop and start searching with known needles
if (buf > small_limit)
break;
step = std::min(step, cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1);
for (auto i = static_cast<int>(cur_needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i)
{
VolnitskyTraits::putNGram<CaseSensitive, ASCII>(
reinterpret_cast<const UInt8 *>(cur_needle_data) + i,
i + 1,
reinterpret_cast<const UInt8 *>(cur_needle_data),
callback);
}
}
fallback_searchers.emplace_back(cur_needle_data, cur_needle_size);
}
return false;
}
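The while (!reset()) contract documented above amounts to batched initialization under a fixed budget: consume needles until the bigram budget is exhausted, search the whole haystack with that batch, then continue with the rest. A tiny sketch of just this control flow, with illustrative per-needle costs instead of the real bigram accounting:
#include <cstddef>
#include <iostream>
#include <vector>
int main()
{
    std::vector<size_t> costs = {40, 30, 50, 20, 60};   /// e.g. bigrams contributed per needle
    const size_t budget = 100;                          /// analogue of small_limit
    size_t last = 0;                                    /// first needle not yet processed
    auto reset = [&]() -> bool
    {
        if (last == costs.size())
            return true;                                /// nothing left to initialize
        size_t buf = 0;
        for (; last < costs.size(); ++last)
        {
            buf += costs[last];
            if (buf > budget)
                break;                                  /// stop: search with this batch first
        }
        return false;
    };
    while (!reset())
        std::cout << "search pass, needles initialized up to index " << last << '\n';
}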
template <typename OneSearcher, typename ResultType>
inline void searchInternal(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const OneSearcher & searchFallback,
ResultType & ans)
{
const size_t haystack_string_size = haystack_offsets.size();
while (!reset())
{
size_t prev_offset = 0;
for (size_t j = 0; j < haystack_string_size; ++j)
{
const auto * haystack = &haystack_data[prev_offset];
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
ans[j] = searchFallback(haystack, haystack_end);
prev_offset = haystack_offsets[j];
}
}
}
inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const
{
const size_t fallback_size = fallback_needles.size();
for (size_t i = 0; i < fallback_size; ++i)
if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end)
return true;
/// check if we have at least one non-empty Volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
return true;
}
}
}
}
return false;
}
inline size_t searchOneIndex(const UInt8 * haystack, const UInt8 * haystack_end) const
{
const size_t fallback_size = fallback_needles.size();
size_t ans = std::numeric_limits<size_t>::max();
for (size_t i = 0; i < fallback_size; ++i)
if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end)
ans = std::min(ans, fallback_needles[i]);
/// check if we have at least one non-empty Volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (res + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(res))
ans = std::min(ans, ind);
}
}
}
}
/*
* if nothing was found, `ans` is std::numeric_limits<size_t>::max(), so `ans + 1` wraps to zero,
* which is exactly the value we need, because positions are returned starting with one
*/
return ans + 1;
}
void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num)
{
size_t cell_num = ngram % VolnitskyTraits::hash_size;
while (hash[cell_num].off)
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size;
hash[cell_num] = {static_cast<VolnitskyTraits::Id>(num), static_cast<VolnitskyTraits::Offset>(offset)};
}
};
using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky
using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;
using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>;
using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
}
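A usage sketch for the single-needle searcher, based only on the constructor and search signatures above (it assumes ClickHouse's include paths; the hint argument decides whether building the hash table is worthwhile):
#include <Common/Volnitsky.h>
#include <cstring>
#include <iostream>
int main()
{
    const char * haystack = "clickhouse compresses columns";
    const char * needle = "columns";
    /// A small haystack_size_hint makes isFallbackNeedle() choose the fallback searcher.
    DB::Volnitsky searcher(needle, std::strlen(needle), std::strlen(haystack));
    /// search() returns the end of the haystack when nothing is found.
    const char * pos = searcher.search(haystack, std::strlen(haystack));
    if (pos != haystack + std::strlen(haystack))
        std::cout << "found at offset " << (pos - haystack) << '\n';
}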

View File

@ -61,6 +61,7 @@ struct Request
{
Request() = default;
Request(const Request &) = default;
Request & operator=(const Request &) = default;
virtual ~Request() = default;
virtual String getPath() const = 0;
virtual void addRootPath(const String & /* root_path */) {}
@ -76,6 +77,7 @@ struct Response
int32_t error = 0;
Response() = default;
Response(const Response &) = default;
Response & operator=(const Response &) = default;
virtual ~Response() = default;
virtual void removeRootPath(const String & /* root_path */) {}
};

View File

@ -0,0 +1,156 @@
#include <Compression/CompressionCodecDelta.h>
#include <Compression/CompressionInfo.h>
#include <Compression/CompressionFactory.h>
#include <common/unaligned.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <IO/WriteHelpers.h>
#include <cstdlib>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_COMPRESS;
extern const int CANNOT_DECOMPRESS;
extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
extern const int ILLEGAL_CODEC_PARAMETER;
}
CompressionCodecDelta::CompressionCodecDelta(UInt8 delta_bytes_size_)
: delta_bytes_size(delta_bytes_size_)
{
}
UInt8 CompressionCodecDelta::getMethodByte() const
{
return static_cast<UInt8>(CompressionMethodByte::Delta);
}
String CompressionCodecDelta::getCodecDesc() const
{
return "Delta(" + toString(delta_bytes_size) + ")";
}
namespace
{
template <typename T>
void compressDataForType(const char * source, UInt32 source_size, char * dest)
{
if (source_size % sizeof(T) != 0)
throw Exception("Cannot delta compress, data size " + toString(source_size) + " is not aligned to " + toString(sizeof(T)), ErrorCodes::CANNOT_COMPRESS);
T prev_src{};
const char * source_end = source + source_size;
while (source < source_end)
{
T curr_src = unalignedLoad<T>(source);
unalignedStore(dest, curr_src - prev_src);
prev_src = curr_src;
source += sizeof(T);
dest += sizeof(T);
}
}
template <typename T>
void decompressDataForType(const char * source, UInt32 source_size, char * dest)
{
if (source_size % sizeof(T) != 0)
throw Exception("Cannot delta decompress, data size " + toString(source_size) + " is not aligned to " + toString(sizeof(T)), ErrorCodes::CANNOT_DECOMPRESS);
T accumulator{};
const char * source_end = source + source_size;
while (source < source_end)
{
accumulator += unalignedLoad<T>(source);
unalignedStore(dest, accumulator);
source += sizeof(T);
dest += sizeof(T);
}
}
}
UInt32 CompressionCodecDelta::doCompressData(const char * source, UInt32 source_size, char * dest) const
{
UInt8 bytes_to_skip = source_size % delta_bytes_size;
dest[0] = delta_bytes_size;
dest[1] = bytes_to_skip;
memcpy(&dest[2], source, bytes_to_skip);
size_t start_pos = 2 + bytes_to_skip;
switch (delta_bytes_size)
{
case 1:
compressDataForType<UInt8>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
case 2:
compressDataForType<UInt16>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
case 4:
compressDataForType<UInt32>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
case 8:
compressDataForType<UInt64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
}
return 1 + 1 + source_size;
}
void CompressionCodecDelta::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 /* uncompressed_size */) const
{
UInt8 bytes_size = source[0];
UInt8 bytes_to_skip = source[1];
memcpy(dest, &source[2], bytes_to_skip);
UInt32 source_size_no_header = source_size - bytes_to_skip - 2;
switch (bytes_size)
{
case 1:
decompressDataForType<UInt8>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
case 2:
decompressDataForType<UInt16>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
case 4:
decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
case 8:
decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
}
}
void registerCodecDelta(CompressionCodecFactory & factory)
{
UInt8 method_code = UInt8(CompressionMethodByte::Delta);
factory.registerCompressionCodecWithType("Delta", method_code, [&](const ASTPtr & arguments, DataTypePtr column_type) -> CompressionCodecPtr
{
UInt8 delta_bytes_size = 1;
if (column_type && column_type->haveMaximumSizeOfValue())
{
size_t max_size = column_type->getSizeOfValueInMemory();
if (max_size == 1 || max_size == 2 || max_size == 4 || max_size == 8)
delta_bytes_size = static_cast<UInt8>(max_size);
}
if (arguments && !arguments->children.empty())
{
if (arguments->children.size() > 1)
throw Exception("Delta codec must have 1 parameter, given " + std::to_string(arguments->children.size()), ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE);
const auto children = arguments->children;
const ASTLiteral * literal = static_cast<const ASTLiteral *>(children[0].get());
size_t user_bytes_size = literal->value.safeGet<UInt64>();
if (user_bytes_size != 1 && user_bytes_size != 2 && user_bytes_size != 4 && user_bytes_size != 8)
throw Exception("Delta value for delta codec can be 1, 2, 4 or 8, given " + toString(user_bytes_size), ErrorCodes::ILLEGAL_CODEC_PARAMETER);
delta_bytes_size = static_cast<UInt8>(user_bytes_size);
}
return std::make_shared<CompressionCodecDelta>(delta_bytes_size);
});
}
}
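To see why decompression above is just a running sum, here is a standalone round trip of the same transform that compressDataForType / decompressDataForType apply per element width (illustrative helper names; the codec additionally prepends the two header bytes and copies the unaligned prefix verbatim, as doCompressData shows):
#include <cstdint>
#include <iostream>
#include <vector>
/// Replace each value with the difference from its predecessor.
void deltaEncode(std::vector<uint32_t> & values)
{
    uint32_t prev = 0;
    for (auto & x : values)
    {
        uint32_t cur = x;
        x = cur - prev;          /// unsigned wraparound is well-defined
        prev = cur;
    }
}
/// Invert the transform by accumulating a running sum.
void deltaDecode(std::vector<uint32_t> & values)
{
    uint32_t accumulator = 0;
    for (auto & x : values)
    {
        accumulator += x;
        x = accumulator;
    }
}
int main()
{
    std::vector<uint32_t> data = {100, 101, 103, 106, 110};
    deltaEncode(data);           /// {100, 1, 2, 3, 4} - small deltas compress well afterwards
    deltaDecode(data);           /// back to {100, 101, 103, 106, 110}
    std::cout << data.back() << '\n';   /// 110
}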

View File

@ -0,0 +1,26 @@
#pragma once
#include <Compression/ICompressionCodec.h>
namespace DB
{
class CompressionCodecDelta : public ICompressionCodec
{
public:
CompressionCodecDelta(UInt8 delta_bytes_size_);
UInt8 getMethodByte() const override;
String getCodecDesc() const override;
protected:
UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override { return uncompressed_size + 2; }
private:
const UInt8 delta_bytes_size;
};
}

View File

@ -8,6 +8,7 @@
#include "CompressionCodecLZ4.h"
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <IO/WriteHelpers.h>
#ifdef __clang__
#pragma clang diagnostic ignored "-Wold-style-cast"
@ -35,7 +36,7 @@ String CompressionCodecLZ4::getCodecDesc() const
return "LZ4";
}
UInt32 CompressionCodecLZ4::getCompressedDataSize(UInt32 uncompressed_size) const
UInt32 CompressionCodecLZ4::getMaxCompressedDataSize(UInt32 uncompressed_size) const
{
return LZ4_COMPRESSBOUND(uncompressed_size);
}
@ -61,7 +62,7 @@ void registerCodecLZ4(CompressionCodecFactory & factory)
String CompressionCodecLZ4HC::getCodecDesc() const
{
return "LZ4HC";
return "LZ4HC(" + toString(level) + ")";
}
UInt32 CompressionCodecLZ4HC::doCompressData(const char * source, UInt32 source_size, char * dest) const

View File

@ -24,7 +24,7 @@ protected:
private:
void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
UInt32 getCompressedDataSize(UInt32 uncompressed_size) const override;
UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;
mutable LZ4::PerformanceStatistics lz4_stat;
};
@ -41,7 +41,7 @@ protected:
UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
private:
int level;
const int level;
};
}

View File

@ -5,6 +5,7 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/hex.h>
#include <sstream>
namespace DB
@ -20,14 +21,16 @@ extern const int CORRUPTED_DATA;
CompressionCodecMultiple::CompressionCodecMultiple(Codecs codecs)
: codecs(codecs)
{
std::ostringstream ss;
for (size_t idx = 0; idx < codecs.size(); idx++)
{
if (idx != 0)
codec_desc = codec_desc + ',';
ss << ',' << ' ';
const auto codec = codecs[idx];
codec_desc = codec_desc + codec->getCodecDesc();
ss << codec->getCodecDesc();
}
codec_desc = ss.str();
}
UInt8 CompressionCodecMultiple::getMethodByte() const
@ -40,7 +43,7 @@ String CompressionCodecMultiple::getCodecDesc() const
return codec_desc;
}
UInt32 CompressionCodecMultiple::getCompressedDataSize(UInt32 uncompressed_size) const
UInt32 CompressionCodecMultiple::getMaxCompressedDataSize(UInt32 uncompressed_size) const
{
UInt32 compressed_size = uncompressed_size;
for (auto & codec : codecs)

View File

@ -15,7 +15,7 @@ public:
String getCodecDesc() const override;
UInt32 getCompressedDataSize(UInt32 uncompressed_size) const override;
UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;
protected:
UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;

View File

@ -28,10 +28,10 @@ UInt8 CompressionCodecZSTD::getMethodByte() const
String CompressionCodecZSTD::getCodecDesc() const
{
return "ZSTD";
return "ZSTD(" + toString(level) + ")";
}
UInt32 CompressionCodecZSTD::getCompressedDataSize(UInt32 uncompressed_size) const
UInt32 CompressionCodecZSTD::getMaxCompressedDataSize(UInt32 uncompressed_size) const
{
return ZSTD_compressBound(uncompressed_size);
}

View File

@ -19,7 +19,7 @@ public:
String getCodecDesc() const override;
UInt32 getCompressedDataSize(UInt32 uncompressed_size) const override;
UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;
protected:
UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
@ -27,7 +27,7 @@ protected:
void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
private:
int level;
const int level;
};
}

View File

@ -10,7 +10,6 @@
#include <Parsers/queryToString.h>
#include <Compression/CompressionCodecMultiple.h>
#include <Compression/CompressionCodecLZ4.h>
#include <Compression/CompressionCodecNone.h>
#include <IO/WriteHelpers.h>
@ -56,7 +55,7 @@ CompressionCodecPtr CompressionCodecFactory::get(const std::vector<CodecNameWith
return std::make_shared<CompressionCodecMultiple>(result);
}
CompressionCodecPtr CompressionCodecFactory::get(const ASTPtr & ast) const
CompressionCodecPtr CompressionCodecFactory::get(const ASTPtr & ast, DataTypePtr column_type) const
{
if (const auto * func = typeid_cast<const ASTFunction *>(ast.get()))
{
@ -65,9 +64,9 @@ CompressionCodecPtr CompressionCodecFactory::get(const ASTPtr & ast) const
for (const auto & inner_codec_ast : func->arguments->children)
{
if (const auto * family_name = typeid_cast<const ASTIdentifier *>(inner_codec_ast.get()))
codecs.emplace_back(getImpl(family_name->name, {}));
codecs.emplace_back(getImpl(family_name->name, {}, column_type));
else if (const auto * ast_func = typeid_cast<const ASTFunction *>(inner_codec_ast.get()))
codecs.emplace_back(getImpl(ast_func->name, ast_func->arguments));
codecs.emplace_back(getImpl(ast_func->name, ast_func->arguments, column_type));
else
throw Exception("Unexpected AST element for compression codec", ErrorCodes::UNEXPECTED_AST_STRUCTURE);
}
@ -88,10 +87,11 @@ CompressionCodecPtr CompressionCodecFactory::get(const UInt8 byte_code) const
if (family_code_and_creator == family_code_with_codec.end())
throw Exception("Unknown codec family code : " + toString(byte_code), ErrorCodes::UNKNOWN_CODEC);
return family_code_and_creator->second({});
return family_code_and_creator->second({}, nullptr);
}
CompressionCodecPtr CompressionCodecFactory::getImpl(const String & family_name, const ASTPtr & arguments) const
CompressionCodecPtr CompressionCodecFactory::getImpl(const String & family_name, const ASTPtr & arguments, DataTypePtr column_type) const
{
if (family_name == "Multiple")
throw Exception("Codec MULTIPLE cannot be specified directly", ErrorCodes::UNKNOWN_CODEC);
@ -101,10 +101,13 @@ CompressionCodecPtr CompressionCodecFactory::getImpl(const String & family_name,
if (family_and_creator == family_name_with_codec.end())
throw Exception("Unknown codec family: " + family_name, ErrorCodes::UNKNOWN_CODEC);
return family_and_creator->second(arguments);
return family_and_creator->second(arguments, column_type);
}
void CompressionCodecFactory::registerCompressionCodec(const String & family_name, std::optional<UInt8> byte_code, Creator creator)
void CompressionCodecFactory::registerCompressionCodecWithType(
const String & family_name,
std::optional<UInt8> byte_code,
CreatorWithType creator)
{
if (creator == nullptr)
throw Exception("CompressionCodecFactory: the codec family " + family_name + " has been provided a null constructor",
@ -118,8 +121,18 @@ void CompressionCodecFactory::registerCompressionCodec(const String & family_nam
throw Exception("CompressionCodecFactory: the codec family name '" + family_name + "' is not unique", ErrorCodes::LOGICAL_ERROR);
}
void CompressionCodecFactory::registerSimpleCompressionCodec(const String & family_name, std::optional<UInt8> byte_code,
std::function<CompressionCodecPtr()> creator)
void CompressionCodecFactory::registerCompressionCodec(const String & family_name, std::optional<UInt8> byte_code, Creator creator)
{
registerCompressionCodecWithType(family_name, byte_code, [family_name, creator](const ASTPtr & ast, DataTypePtr /* data_type */)
{
return creator(ast);
});
}
void CompressionCodecFactory::registerSimpleCompressionCodec(
const String & family_name,
std::optional<UInt8> byte_code,
SimpleCreator creator)
{
registerCompressionCodec(family_name, byte_code, [family_name, creator](const ASTPtr & ast)
{
@ -135,7 +148,7 @@ void registerCodecNone(CompressionCodecFactory & factory);
void registerCodecZSTD(CompressionCodecFactory & factory);
void registerCodecMultiple(CompressionCodecFactory & factory);
void registerCodecLZ4HC(CompressionCodecFactory & factory);
//void registerCodecDelta(CompressionCodecFactory & factory);
void registerCodecDelta(CompressionCodecFactory & factory);
CompressionCodecFactory::CompressionCodecFactory()
{
@ -145,7 +158,7 @@ CompressionCodecFactory::CompressionCodecFactory()
registerCodecZSTD(*this);
registerCodecMultiple(*this);
registerCodecLZ4HC(*this);
// registerCodecDelta(*this);
registerCodecDelta(*this);
}
}

View File

@ -5,6 +5,7 @@
#include <optional>
#include <unordered_map>
#include <ext/singleton.h>
#include <DataTypes/IDataType.h>
#include <Common/IFactoryWithAliases.h>
#include <Compression/ICompressionCodec.h>
#include <Compression/CompressionInfo.h>
@ -28,16 +29,19 @@ class CompressionCodecFactory final : public ext::singleton<CompressionCodecFact
{
protected:
using Creator = std::function<CompressionCodecPtr(const ASTPtr & parameters)>;
using CreatorWithType = std::function<CompressionCodecPtr(const ASTPtr & parameters, DataTypePtr column_type)>;
using SimpleCreator = std::function<CompressionCodecPtr()>;
using CompressionCodecsDictionary = std::unordered_map<String, Creator>;
using CompressionCodecsCodeDictionary = std::unordered_map<UInt8, Creator>;
using CompressionCodecsDictionary = std::unordered_map<String, CreatorWithType>;
using CompressionCodecsCodeDictionary = std::unordered_map<UInt8, CreatorWithType>;
public:
/// Return default codec (currently LZ4)
CompressionCodecPtr getDefaultCodec() const;
/// Get codec by AST
CompressionCodecPtr get(const ASTPtr & ast) const;
/// Get codec by AST and, optionally, the column type.
/// Some codecs can use information about the type to tune their internal settings,
/// but every codec must be able to work without that information.
CompressionCodecPtr get(const ASTPtr & ast, DataTypePtr column_type=nullptr) const;
/// Get codec by method byte (no params available)
CompressionCodecPtr get(const UInt8 byte_code) const;
@ -46,6 +50,9 @@ public:
CompressionCodecPtr get(const String & family_name, std::optional<int> level) const;
CompressionCodecPtr get(const std::vector<CodecNameWithLevel> & codecs) const;
/// Register codec with parameters and column type
void registerCompressionCodecWithType(const String & family_name, std::optional<UInt8> byte_code, CreatorWithType creator);
/// Register codec with parameters
void registerCompressionCodec(const String & family_name, std::optional<UInt8> byte_code, Creator creator);
@ -53,7 +60,7 @@ public:
void registerSimpleCompressionCodec(const String & family_name, std::optional<UInt8> byte_code, SimpleCreator creator);
protected:
CompressionCodecPtr getImpl(const String & family_name, const ASTPtr & arguments) const;
CompressionCodecPtr getImpl(const String & family_name, const ASTPtr & arguments, DataTypePtr column_type) const;
private:
CompressionCodecsDictionary family_name_with_codec;

View File

@ -39,6 +39,7 @@ enum class CompressionMethodByte : uint8_t
LZ4 = 0x82,
ZSTD = 0x90,
Multiple = 0x91,
Delta = 0x92,
};
}

View File

@ -41,7 +41,7 @@ public:
virtual UInt32 decompress(char * source, UInt32 source_size, char * dest) const;
/// Number of bytes that will be used to compress uncompressed_size bytes with the current codec
virtual UInt32 getCompressedReserveSize(UInt32 uncompressed_size) const { return getHeaderSize() + getCompressedDataSize(uncompressed_size); }
virtual UInt32 getCompressedReserveSize(UInt32 uncompressed_size) const { return getHeaderSize() + getMaxCompressedDataSize(uncompressed_size); }
/// Some codecs (LZ4, for example) require additional bytes at end of buffer
virtual UInt32 getAdditionalSizeAtTheEndOfBuffer() const { return 0; }
@ -61,7 +61,7 @@ public:
protected:
/// Return size of compressed data without header
virtual UInt32 getCompressedDataSize(UInt32 uncompressed_size) const { return uncompressed_size; }
virtual UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const { return uncompressed_size; }
/// Actually compress data, without header
virtual UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const = 0;
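Under this contract a caller can size the destination buffer once, up front. A hedged sketch of that call pattern (it assumes a compress() method symmetric to the decompress() declared above, which is outside this hunk, and uses std::vector instead of ClickHouse's internal buffer types):
/// codec is a CompressionCodecPtr obtained from CompressionCodecFactory.
/// getCompressedReserveSize() = getHeaderSize() + getMaxCompressedDataSize(n),
/// so dest is guaranteed to fit the worst case.
std::vector<char> dest(codec->getCompressedReserveSize(source_size));
UInt32 compressed_size = codec->compress(source, source_size, dest.data());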

View File

@ -14,6 +14,7 @@
#include <boost/range/join.hpp>
#include <common/logger_useful.h>
namespace DB
{
@ -22,6 +23,7 @@ namespace ErrorCodes
extern const int BAD_TYPE_OF_FIELD;
extern const int BAD_ARGUMENTS;
extern const int THERE_IS_NO_COLUMN;
extern const int LOGICAL_ERROR;
}
static String getSchemaPath(const String & schema_dir, const String & schema_file)
@ -107,52 +109,50 @@ capnp::StructSchema::Field getFieldOrThrow(capnp::StructSchema node, const std::
else
throw Exception("Field " + field + " doesn't exist in schema " + node.getShortDisplayName().cStr(), ErrorCodes::THERE_IS_NO_COLUMN);
}
bool checkEqualFrom(const std::vector<std::string> &a,const std::vector<std::string> &b, const size_t index)
void CapnProtoRowInputStream::createActions(const NestedFieldList & sorted_fields, capnp::StructSchema reader)
{
for(int i = index; i >= 0; i++)
{
if(a[i] != b[i])
return false;
}
return true;
}
void CapnProtoRowInputStream::createActions(const NestedFieldList & sortedFields, capnp::StructSchema reader)
{
//Store parents and their tokens in order to backtrack
/// Columns in a table can map to fields in Cap'n'Proto or to structs.
/// Store common parents and their tokens in order to backtrack.
std::vector<capnp::StructSchema::Field> parents;
std::vector<std::string> tokens;
std::vector<std::string> parent_tokens;
capnp::StructSchema cur_reader = reader;
size_t level = 0;
for (const auto & field : sortedFields)
for (const auto & field : sorted_fields)
{
//Backtrackt to common parent
while(level > (field.tokens.size()-1) || !checkEqualFrom(tokens,field.tokens,level-1))
if (field.tokens.empty())
throw Exception("Logical error in CapnProtoRowInputStream", ErrorCodes::LOGICAL_ERROR);
// Backtrack to common parent
while (field.tokens.size() < parent_tokens.size() + 1
|| !std::equal(parent_tokens.begin(), parent_tokens.end(), field.tokens.begin()))
{
level--;
actions.push_back({Action::POP});
tokens.pop_back();
parents.pop_back();
if(level > 0)
parent_tokens.pop_back();
if (parents.empty())
{
cur_reader = parents[level-1].getType().asStruct();
}
else {
cur_reader = reader;
break;
}
else
cur_reader = parents.back().getType().asStruct();
}
//Go forward
for (; level < field.tokens.size() - 1; ++level)
{
auto node = getFieldOrThrow(cur_reader, field.tokens[level]);
// Go forward
while (parent_tokens.size() + 1 < field.tokens.size())
{
const auto & token = field.tokens[parents.size()];
auto node = getFieldOrThrow(cur_reader, token);
if (node.getType().isStruct())
{
// Descend to field structure
parents.push_back(node);
tokens.push_back(field.tokens[level]);
parents.emplace_back(node);
parent_tokens.emplace_back(token);
cur_reader = node.getType().asStruct();
actions.push_back({Action::PUSH, node});
}
@ -161,11 +161,11 @@ void CapnProtoRowInputStream::createActions(const NestedFieldList & sortedFields
break; // Collect list
}
else
throw Exception("Field " + field.tokens[level] + "is neither Struct nor List", ErrorCodes::BAD_TYPE_OF_FIELD);
throw Exception("Field " + token + " is neither Struct nor List", ErrorCodes::BAD_TYPE_OF_FIELD);
}
// Read field from the structure
auto node = getFieldOrThrow(cur_reader, field.tokens[level]);
auto node = getFieldOrThrow(cur_reader, field.tokens[parents.size()]);
if (node.getType().isList() && actions.size() > 0 && actions.back().field == node)
{
// The field list here flattens Nested elements into multiple arrays
@ -187,7 +187,6 @@ void CapnProtoRowInputStream::createActions(const NestedFieldList & sortedFields
CapnProtoRowInputStream::CapnProtoRowInputStream(ReadBuffer & istr_, const Block & header_, const String & schema_dir, const String & schema_file, const String & root_object)
: istr(istr_), header(header_), parser(std::make_shared<SchemaParser>())
{
// Parse the schema and fetch the root object
#pragma GCC diagnostic push
@ -207,15 +206,8 @@ CapnProtoRowInputStream::CapnProtoRowInputStream(ReadBuffer & istr_, const Block
for (size_t i = 0; i < num_columns; ++i)
list.push_back(split(header, i));
// Order list first by value of strings then by length of sting vector.
std::sort(list.begin(), list.end(), [](const NestedField & a, const NestedField & b)
{
size_t min = std::min(a.tokens.size(),b.tokens.size());
for(size_t i = 0; i < min; i++)
if(a.tokens[i] != b.tokens[i])
return a.tokens[i] > b.tokens[i];
return a.tokens.size() < b.tokens.size();
});
// Order list first by value of strings then by length of string vector.
std::sort(list.begin(), list.end(), [](const NestedField & a, const NestedField & b) { return a.tokens < b.tokens; });
createActions(list, root);
}
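Editor's sketch (not part of this diff): the simplified comparator works because std::vector<std::string>'s operator< is lexicographic, so token paths sort by token values and a prefix sorts before its extensions, which is exactly the order the backtracking in createActions relies on. A standalone check, assuming only the standard library:
#include <cassert>
#include <string>
#include <vector>
int main()
{
    using Tokens = std::vector<std::string>;
    Tokens a{"outer"};
    Tokens b{"outer", "inner"};
    Tokens c{"zzz"};
    assert(a < b); /// a prefix precedes its extensions
    assert(b < c); /// otherwise ordered by token values
}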

View File

@ -1,28 +1,28 @@
#include <Functions/FunctionsStringSearch.h>
#include <memory>
#include <mutex>
#include <Poco/UTF8String.h>
#include <Columns/ColumnFixedString.h>
#include <Common/Volnitsky.h>
#include <Common/config.h>
#include <DataTypes/DataTypeFixedString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h>
#include <IO/WriteHelpers.h>
#include <Common/config.h>
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <Poco/UTF8String.h>
#include <Common/Volnitsky.h>
#include <algorithm>
#include <memory>
#if USE_RE2_ST
#include <re2_st/re2.h> // Y_IGNORE
# include <re2_st/re2.h> // Y_IGNORE
#else
#define re2_st re2
# define re2_st re2
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
@ -35,7 +35,10 @@ namespace ErrorCodes
struct PositionCaseSensitiveASCII
{
/// For searching a single substring inside a big-enough contiguous chunk of data. Could have slightly expensive initialization.
using SearcherInBigHaystack = VolnitskyImpl<true, true>;
using SearcherInBigHaystack = Volnitsky;
/// For searching many substrings in one string
using MultiSearcherInBigHaystack = MultiVolnitsky;
/// For searching a single substring that is different each time. This object is created for each row of data. It must have cheap initialization.
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
@ -50,23 +53,24 @@ struct PositionCaseSensitiveASCII
return SearcherInSmallHaystack(needle_data, needle_size);
}
/// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8).
static size_t countChars(const char * begin, const char * end)
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
{
return end - begin;
return MultiSearcherInBigHaystack(needles);
}
/// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8).
static size_t countChars(const char * begin, const char * end) { return end - begin; }
/// Convert string to lowercase. Only for case-insensitive search.
/// Implementation is permitted to be inefficient because it is called for single string.
static void toLowerIfNeed(std::string &)
{
}
static void toLowerIfNeed(std::string &) {}
};
struct PositionCaseInsensitiveASCII
{
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
@ -79,20 +83,20 @@ struct PositionCaseInsensitiveASCII
return SearcherInSmallHaystack(needle_data, needle_size);
}
static size_t countChars(const char * begin, const char * end)
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
{
return end - begin;
return MultiSearcherInBigHaystack(needles);
}
static void toLowerIfNeed(std::string & s)
{
std::transform(std::begin(s), std::end(s), std::begin(s), tolower);
}
static size_t countChars(const char * begin, const char * end) { return end - begin; }
static void toLowerIfNeed(std::string & s) { std::transform(std::begin(s), std::end(s), std::begin(s), tolower); }
};
struct PositionCaseSensitiveUTF8
{
using SearcherInBigHaystack = VolnitskyImpl<true, false>;
using SearcherInBigHaystack = VolnitskyUTF8;
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
@ -105,6 +109,11 @@ struct PositionCaseSensitiveUTF8
return SearcherInSmallHaystack(needle_data, needle_size);
}
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
{
return MultiSearcherInBigHaystack(needles);
}
static size_t countChars(const char * begin, const char * end)
{
size_t res = 0;
@ -114,14 +123,13 @@ struct PositionCaseSensitiveUTF8
return res;
}
static void toLowerIfNeed(std::string &)
{
}
static void toLowerIfNeed(std::string &) {}
};
struct PositionCaseInsensitiveUTF8
{
using SearcherInBigHaystack = VolnitskyImpl<false, false>;
using SearcherInBigHaystack = VolnitskyCaseInsensitiveUTF8;
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitiveUTF8;
using SearcherInSmallHaystack = UTF8CaseInsensitiveStringSearcher; /// TODO Very suboptimal.
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
@ -134,6 +142,11 @@ struct PositionCaseInsensitiveUTF8
return SearcherInSmallHaystack(needle_data, needle_size);
}
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
{
return MultiSearcherInBigHaystack(needles);
}
static size_t countChars(const char * begin, const char * end)
{
size_t res = 0;
@ -143,10 +156,7 @@ struct PositionCaseInsensitiveUTF8
return res;
}
static void toLowerIfNeed(std::string & s)
{
Poco::UTF8::toLowerInPlace(s);
}
static void toLowerIfNeed(std::string & s) { Poco::UTF8::toLowerInPlace(s); }
};
template <typename Impl>
@ -155,10 +165,8 @@ struct PositionImpl
using ResultType = UInt64;
/// Find one substring in many strings.
static void vector_constant(const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & needle,
PaddedPODArray<UInt64> & res)
static void vector_constant(
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray<UInt64> & res)
{
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
@ -210,7 +218,8 @@ struct PositionImpl
}
/// Search each time for a different single substring inside each time different string.
static void vector_vector(const ColumnString::Chars & haystack_data,
static void vector_vector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const ColumnString::Chars & needle_data,
const ColumnString::Offsets & needle_offsets,
@ -234,9 +243,9 @@ struct PositionImpl
else
{
/// It is assumed that the StringSearcher is not very difficult to initialize.
typename Impl::SearcherInSmallHaystack searcher
= Impl::createSearcherInSmallHaystack(reinterpret_cast<const char *>(&needle_data[prev_needle_offset]),
needle_offsets[i] - prev_needle_offset - 1); /// zero byte at the end
typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
reinterpret_cast<const char *>(&needle_data[prev_needle_offset]),
needle_offsets[i] - prev_needle_offset - 1); /// zero byte at the end
/// searcher returns a pointer to the found substring or to the end of `haystack`.
size_t pos = searcher.search(&haystack_data[prev_haystack_offset], &haystack_data[haystack_offsets[i] - 1])
@ -244,8 +253,10 @@ struct PositionImpl
if (pos != haystack_size)
{
res[i] = 1 + Impl::countChars(reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]),
reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset + pos]));
res[i] = 1
+ Impl::countChars(
reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]),
reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset + pos]));
}
else
res[i] = 0;
@ -256,8 +267,9 @@ struct PositionImpl
}
}
/// Find many substrings in one line.
static void constant_vector(const String & haystack,
/// Find many substrings in a single string.
static void constant_vector(
const String & haystack,
const ColumnString::Chars & needle_data,
const ColumnString::Offsets & needle_offsets,
PaddedPODArray<UInt64> & res)
@ -281,7 +293,8 @@ struct PositionImpl
typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), needle_offsets[i] - prev_needle_offset - 1);
size_t pos = searcher.search(reinterpret_cast<const UInt8 *>(haystack.data()),
size_t pos = searcher.search(
reinterpret_cast<const UInt8 *>(haystack.data()),
reinterpret_cast<const UInt8 *>(haystack.data()) + haystack.size())
- reinterpret_cast<const UInt8 *>(haystack.data());
@ -298,6 +311,56 @@ struct PositionImpl
}
};
template <typename Impl>
struct MultiPositionImpl
{
using ResultType = UInt64;
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
auto resCallback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchAll(haystack_data, haystack_offsets, resCallback, res);
}
};
template <typename Impl>
struct MultiSearchImpl
{
using ResultType = UInt64;
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets, res);
}
};
template <typename Impl>
struct FirstMatchImpl
{
using ResultType = UInt64;
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
Impl::createMultiSearcherInBigHaystack(needles).searchIndex(haystack_data, haystack_offsets, res);
}
};
/// Is the LIKE expression reduced to finding a substring in a string?
inline bool likePatternIsStrstr(const String & pattern, String & res)
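Editor's sketch (not part of this diff; the function body is elided between hunks): the idea is that a LIKE pattern reduces to a plain substring search when it has the shape '%substring%' with no other metacharacters. A hedged standalone approximation, with a hypothetical name:
#include <string>
bool likeReducesToStrstr(const std::string & pattern, std::string & res)
{
    if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%')
        return false;
    res = pattern.substr(1, pattern.size() - 2);
    /// No inner '%', '_' or escape characters may remain.
    return res.find_first_of("%_\\") == std::string::npos;
}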
@ -348,10 +411,8 @@ struct MatchImpl
{
using ResultType = UInt8;
static void vector_constant(const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & pattern,
PaddedPODArray<UInt8> & res)
static void vector_constant(
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res)
{
if (offsets.empty())
return;
@ -467,13 +528,14 @@ struct MatchImpl
size_t str_size = (i != 0 ? offsets[i] - offsets[i - 1] : offsets[0]) - 1;
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
* so that it can match when `required_substring` occurs into the line several times,
* so that it can match when `required_substring` occurs in the string several times,
* and at the first occurrence, the regexp is not a match.
*/
if (required_substring_is_prefix)
res[i] = revert
^ regexp->getRE2()->Match(re2_st::StringPiece(str_data, str_size),
^ regexp->getRE2()->Match(
re2_st::StringPiece(str_data, str_size),
reinterpret_cast<const char *>(pos) - str_data,
str_size,
re2_st::RE2::UNANCHORED,
@ -504,13 +566,15 @@ struct MatchImpl
res = revert ^ regexp->match(data);
}
template <typename... Args> static void vector_vector(Args &&...)
template <typename... Args>
static void vector_vector(Args &&...)
{
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}
/// Search for different needles in a single haystack.
template <typename... Args> static void constant_vector(Args &&...)
template <typename... Args>
static void constant_vector(Args &&...)
{
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}
@ -519,7 +583,8 @@ struct MatchImpl
struct ExtractImpl
{
static void vector(const ColumnString::Chars & data,
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & pattern,
ColumnString::Chars & res_data,
@ -613,16 +678,17 @@ struct ReplaceRegexpImpl
for (const auto & it : instructions)
if (it.first >= num_captures)
throw Exception("Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
+ toString(num_captures - 1)
+ " subpatterns",
throw Exception(
"Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
+ toString(num_captures - 1) + " subpatterns",
ErrorCodes::BAD_ARGUMENTS);
return instructions;
}
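Editor's sketch (not part of this diff): the instruction list built above maps literal spans and capture-group ids, with ids past the group count rejected as BAD_ARGUMENTS. Plain re2 (the diff itself goes through re2_st) exposes the same rewrite semantics:
#include <re2/re2.h>
#include <iostream>
int main()
{
    std::string s = "ab1cd2";
    /// \1 refers to the first capture group; RE2 likewise rejects
    /// rewrite ids beyond the number of groups in the pattern.
    re2::RE2::GlobalReplace(&s, "([0-9])", "[\\1]");
    std::cout << s << '\n'; /// ab[1]cd[2]
}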
static void processString(const re2_st::StringPiece & input,
static void processString(
const re2_st::StringPiece & input,
ColumnString::Chars & res_data,
ColumnString::Offset & res_offset,
re2_st::RE2 & searcher,
@ -687,7 +753,8 @@ struct ReplaceRegexpImpl
}
static void vector(const ColumnString::Chars & data,
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & needle,
const std::string & replacement,
@ -715,7 +782,8 @@ struct ReplaceRegexpImpl
}
}
static void vector_fixed(const ColumnString::Chars & data,
static void vector_fixed(
const ColumnString::Chars & data,
size_t n,
const std::string & needle,
const std::string & replacement,
@ -749,7 +817,8 @@ struct ReplaceRegexpImpl
template <bool replace_one = false>
struct ReplaceStringImpl
{
static void vector(const ColumnString::Chars & data,
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & needle,
const std::string & replacement,
@ -791,7 +860,7 @@ struct ReplaceStringImpl
if (i == offsets.size())
break;
/// Is it true that this line no longer needs to perform transformations.
/// Whether this string needs no further transformations.
bool can_finish_current_string = false;
/// We check that the entry does not go through the boundaries of strings.
@ -824,7 +893,8 @@ struct ReplaceStringImpl
/// Note: this function converts fixed-length strings to variable-length strings
/// and each variable-length string should ends with zero byte.
static void vector_fixed(const ColumnString::Chars & data,
static void vector_fixed(
const ColumnString::Chars & data,
size_t n,
const std::string & needle,
const std::string & replacement,
@ -851,7 +921,8 @@ struct ReplaceStringImpl
const UInt8 * match = searcher.search(pos, end - pos);
#define COPY_REST_OF_CURRENT_STRING() \
do { \
do \
{ \
const size_t len = begin + n * (i + 1) - pos; \
res_data.resize(res_data.size() + len + 1); \
memcpy(&res_data[res_offset], pos, len); \
@ -878,7 +949,7 @@ struct ReplaceStringImpl
memcpy(&res_data[res_offset], pos, match - pos);
res_offset += (match - pos);
/// Is it true that this line no longer needs to perform conversions.
/// Whether this string needs no further conversions.
bool can_finish_current_string = false;
/// We check that the entry does not pass through the boundaries of strings.
@ -935,20 +1006,11 @@ class FunctionStringReplace : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionStringReplace>();
}
static FunctionPtr create(const Context &) { return std::make_shared<FunctionStringReplace>(); }
String getName() const override
{
return name;
}
String getName() const override { return name; }
size_t getNumberOfArguments() const override
{
return 3;
}
size_t getNumberOfArguments() const override { return 3; }
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
@ -956,15 +1018,18 @@ public:
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isStringOrFixedString(arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
throw Exception(
"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[1]))
throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
throw Exception(
"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[2]))
throw Exception("Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
throw Exception(
"Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
@ -1025,6 +1090,54 @@ struct NamePositionCaseInsensitiveUTF8
{
static constexpr auto name = "positionCaseInsensitiveUTF8";
};
struct NameMultiPosition
{
static constexpr auto name = "multiPosition";
};
struct NameMultiPositionUTF8
{
static constexpr auto name = "multiPositionUTF8";
};
struct NameMultiPositionCaseInsensitive
{
static constexpr auto name = "multiPositionCaseInsensitive";
};
struct NameMultiPositionCaseInsensitiveUTF8
{
static constexpr auto name = "multiPositionCaseInsensitiveUTF8";
};
struct NameMultiSearch
{
static constexpr auto name = "multiSearch";
};
struct NameMultiSearchUTF8
{
static constexpr auto name = "multiSearchUTF8";
};
struct NameMultiSearchCaseInsensitive
{
static constexpr auto name = "multiSearchCaseInsensitive";
};
struct NameMultiSearchCaseInsensitiveUTF8
{
static constexpr auto name = "multiSearchCaseInsensitiveUTF8";
};
struct NameFirstMatch
{
static constexpr auto name = "firstMatch";
};
struct NameFirstMatchUTF8
{
static constexpr auto name = "firstMatchUTF8";
};
struct NameFirstMatchCaseInsensitive
{
static constexpr auto name = "firstMatchCaseInsensitive";
};
struct NameFirstMatchCaseInsensitiveUTF8
{
static constexpr auto name = "firstMatchCaseInsensitiveUTF8";
};
struct NameMatch
{
static constexpr auto name = "match";
@ -1064,6 +1177,27 @@ using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<Posit
using FunctionPositionCaseInsensitiveUTF8
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
using FunctionMultiPosition = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveASCII>, NameMultiPosition>;
using FunctionMultiPositionUTF8 = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveUTF8>, NameMultiPositionUTF8>;
using FunctionMultiPositionCaseInsensitive
= FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveASCII>, NameMultiPositionCaseInsensitive>;
using FunctionMultiPositionCaseInsensitiveUTF8
= FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiPositionCaseInsensitiveUTF8>;
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearch>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchUTF8>;
using FunctionMultiSearchCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchCaseInsensitive>;
using FunctionMultiSearchCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchCaseInsensitiveUTF8>;
using FunctionFirstMatch = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveASCII>, NameFirstMatch>;
using FunctionFirstMatchUTF8 = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveUTF8>, NameFirstMatchUTF8>;
using FunctionFirstMatchCaseInsensitive
= FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveASCII>, NameFirstMatchCaseInsensitive>;
using FunctionFirstMatchCaseInsensitiveUTF8
= FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveUTF8>, NameFirstMatchCaseInsensitiveUTF8>;
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
@ -1080,14 +1214,32 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionReplaceAll>();
factory.registerFunction<FunctionReplaceRegexpOne>();
factory.registerFunction<FunctionReplaceRegexpAll>();
factory.registerFunction<FunctionPosition>(FunctionFactory::CaseInsensitive);
factory.registerFunction<FunctionPositionUTF8>();
factory.registerFunction<FunctionPositionCaseInsensitive>();
factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiPosition>();
factory.registerFunction<FunctionMultiPositionUTF8>();
factory.registerFunction<FunctionMultiPositionCaseInsensitive>();
factory.registerFunction<FunctionMultiPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearch>();
factory.registerFunction<FunctionMultiSearchUTF8>();
factory.registerFunction<FunctionMultiSearchCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionFirstMatch>();
factory.registerFunction<FunctionFirstMatchUTF8>();
factory.registerFunction<FunctionFirstMatchCaseInsensitive>();
factory.registerFunction<FunctionFirstMatchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionExtract>();
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
}

View File

@ -1,17 +1,20 @@
#pragma once
#include <Columns/ColumnArray.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Core/Field.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/IFunction.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <IO/WriteHelpers.h>
#include <common/StringRef.h>
namespace DB
{
/** Search and replace functions in strings:
*
* position(haystack, needle) - the normal search for a substring in a string; returns the position (in bytes) of the found substring, starting from 1, or 0 if the substring is not found.
@ -35,12 +38,28 @@ namespace DB
*
* replaceRegexpOne(haystack, pattern, replacement) - replaces the pattern with the specified regexp, only the first occurrence.
* replaceRegexpAll(haystack, pattern, replacement) - replaces the pattern with the specified type, all occurrences.
*
* multiPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find the first occurrences (positions) of all the const patterns inside haystack
* multiPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*
* multiSearch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1
* multiSearchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the 1-based index of the first matched pattern, or zero if nothing was found
* firstMatchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*/
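Editor's sketch (not part of this diff): the documented semantics of the three families, simulated over std::string with 1-based positions and 0 meaning "not found". Names and tie-breaking are illustrative; the real functions go through the Volnitsky multi-searcher:
#include <cassert>
#include <string>
#include <vector>
int main()
{
    std::string haystack = "hello world";
    std::vector<std::string> needles = {"xyz", "world"};
    std::vector<size_t> positions; /// multiPosition: one entry per pattern
    for (const auto & n : needles)
    {
        size_t p = haystack.find(n);
        positions.push_back(p == std::string::npos ? 0 : p + 1);
    }
    assert((positions == std::vector<size_t>{0, 7}));
    bool any = positions[0] || positions[1]; /// multiSearch: 0 or 1
    assert(any);
    size_t first_index = 0; /// firstMatch: 1-based index of a matched pattern
    for (size_t i = 0; i < positions.size(); ++i)
        if (positions[i]) { first_index = i + 1; break; }
    assert(first_index == 2);
}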
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
template <typename Impl, typename Name>
@ -48,20 +67,11 @@ class FunctionsStringSearch : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionsStringSearch>();
}
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearch>(); }
String getName() const override
{
return name;
}
String getName() const override { return name; }
size_t getNumberOfArguments() const override
{
return 2;
}
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
@ -90,7 +100,8 @@ public:
{
ResultType res{};
Impl::constant_constant(col_haystack_const->getValue<String>(), col_needle_const->getValue<String>(), res);
block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
block.getByPosition(result).column
= block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
return;
}
@ -103,20 +114,22 @@ public:
const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle);
if (col_haystack_vector && col_needle_vector)
Impl::vector_vector(col_haystack_vector->getChars(),
Impl::vector_vector(
col_haystack_vector->getChars(),
col_haystack_vector->getOffsets(),
col_needle_vector->getChars(),
col_needle_vector->getOffsets(),
vec_res);
else if (col_haystack_vector && col_needle_const)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res);
Impl::vector_constant(
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res);
else if (col_haystack_const && col_needle_vector)
Impl::constant_vector(col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res);
Impl::constant_vector(
col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res);
else
throw Exception("Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and "
+ block.getByPosition(arguments[1]).column->getName()
+ " of arguments of function "
+ getName(),
throw Exception(
"Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and "
+ block.getByPosition(arguments[1]).column->getName() + " of arguments of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
block.getByPosition(result).column = std::move(col_res);
@ -129,20 +142,11 @@ class FunctionsStringSearchToString : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionsStringSearchToString>();
}
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearchToString>(); }
String getName() const override
{
return name;
}
String getName() const override { return name; }
size_t getNumberOfArguments() const override
{
return 2;
}
size_t getNumberOfArguments() const override { return 2; }
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
@ -186,4 +190,156 @@ public:
}
};
template <typename Impl, typename Name>
class FunctionsMultiStringPosition : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringPosition>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() + 1 >= std::numeric_limits<UInt8>::max())
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
+ ", should be at most 255.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
using ResultType = typename Impl::ResultType;
const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column;
const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
if (!col_const_arr)
throw Exception(
"Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const",
ErrorCodes::ILLEGAL_COLUMN);
Array src_arr = col_const_arr->getValue<Array>();
std::vector<StringRef> refs;
for (const auto & el : src_arr)
refs.emplace_back(el.get<String>());
const size_t column_haystack_size = column_haystack->size();
auto col_res = ColumnVector<ResultType>::create();
auto col_offsets = ColumnArray::ColumnOffsets::create(column_haystack_size);
auto & vec_res = col_res->getData();
auto & offsets_res = col_offsets->getData();
vec_res.resize(column_haystack_size * refs.size());
if (col_haystack_vector)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
size_t refs_size = refs.size();
size_t accum = refs_size;
for (size_t i = 0; i < column_haystack_size; ++i, accum += refs_size)
offsets_res[i] = accum;
block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets));
}
};
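Editor's sketch (not part of this diff): for N haystack rows and M needles, executeImpl above produces a flat result of N * M positions and row offsets M, 2M, ..., N * M, which is how the Array column is assembled. A standalone check of the accumulation:
#include <cassert>
#include <vector>
int main()
{
    const size_t rows = 3, needles = 4;
    std::vector<size_t> offsets(rows);
    size_t accum = needles;
    for (size_t i = 0; i < rows; ++i, accum += needles)
        offsets[i] = accum; /// same loop shape as in executeImpl
    assert((offsets == std::vector<size_t>{4, 8, 12}));
}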
template <typename Impl, typename Name>
class FunctionsMultiStringSearch : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringSearch>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() + 1 >= std::numeric_limits<UInt8>::max())
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
+ ", should be at most 255.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
using ResultType = typename Impl::ResultType;
const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column;
const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
if (!col_const_arr)
throw Exception(
"Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const",
ErrorCodes::ILLEGAL_COLUMN);
Array src_arr = col_const_arr->getValue<Array>();
std::vector<StringRef> refs;
refs.reserve(src_arr.size());
for (const auto & el : src_arr)
refs.emplace_back(el.get<String>());
const size_t column_haystack_size = column_haystack->size();
auto col_res = ColumnVector<ResultType>::create();
auto & vec_res = col_res->getData();
vec_res.resize(column_haystack_size);
if (col_haystack_vector)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
block.getByPosition(result).column = std::move(col_res);
}
};
}

View File

@ -446,13 +446,11 @@ void ActionsVisitor::visit(const ASTPtr & ast)
for (size_t j = 0; j < lambda_arg_asts.size(); ++j)
{
ASTIdentifier * lambda_identifier = typeid_cast<ASTIdentifier *>(lambda_arg_asts[j].get());
if (!lambda_identifier)
auto opt_arg_name = getIdentifierName(lambda_arg_asts[j]);
if (!opt_arg_name)
throw Exception("lambda argument declarations must be identifiers", ErrorCodes::TYPE_MISMATCH);
String arg_name = lambda_identifier->name;
lambda_arguments.emplace_back(arg_name, lambda_type->getArgumentTypes()[j]);
lambda_arguments.emplace_back(*opt_arg_name, lambda_type->getArgumentTypes()[j]);
}
actions_stack.pushLevel(lambda_arguments);
@ -541,9 +539,6 @@ void ActionsVisitor::makeSet(const ASTFunction * node, const Block & sample_bloc
const ASTIdentifier * identifier = typeid_cast<const ASTIdentifier *>(arg.get());
if (typeid_cast<const ASTSubquery *>(arg.get()) || identifier)
{
/// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery.
String set_id = arg->getColumnName();
/// A special case is if the name of the table is specified on the right side of the IN statement,
/// and the table has the type Set (a previously prepared set).
if (identifier)
@ -563,6 +558,9 @@ void ActionsVisitor::makeSet(const ASTFunction * node, const Block & sample_bloc
}
}
/// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery.
String set_id = arg->getColumnName();
SubqueryForSet & subquery_for_set = subqueries_for_sets[set_id];
/// If you already created a Set with the same subquery / table.

View File

@ -10,6 +10,7 @@
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/DumpASTNode.h>
#include <Interpreters/DatabaseAndTableWithAlias.h>
namespace DB
{
@ -90,20 +91,16 @@ private:
void visit(ASTTableExpression & table_expression, ASTPtr &) const
{
if (table_expression.database_and_table_name)
{
tryVisit<ASTIdentifier>(table_expression.database_and_table_name);
if (table_expression.database_and_table_name->children.size() != 2)
throw Exception("Logical error: more than two components in table expression", ErrorCodes::LOGICAL_ERROR);
}
else if (table_expression.subquery)
tryVisit<ASTSubquery>(table_expression.subquery);
}
/// @note It expects that only table (not column) identifiers are visited.
void visit(const ASTIdentifier & identifier, ASTPtr & ast) const
{
if (ast->children.empty())
ast = createDatabaseAndTableNode(database_name, identifier.name);
if (identifier.name_parts.empty())
ast = createTableIdentifier(database_name, identifier.name);
}
void visit(ASTSubquery & subquery, ASTPtr &) const

View File

@ -67,12 +67,13 @@ Cluster::Address::Address(const Poco::Util::AbstractConfiguration & config, cons
}
Cluster::Address::Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port)
Cluster::Address::Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port, bool secure_)
: user(user_), password(password_)
{
auto parsed_host_port = parseAddress(host_port_, clickhouse_port);
host_name = parsed_host_port.first;
port = parsed_host_port.second;
secure = secure_ ? Protocol::Secure::Enable : Protocol::Secure::Disable;
initially_resolved_address = DNSResolver::instance().resolveAddress(parsed_host_port.first, parsed_host_port.second);
is_local = isLocal(*this, initially_resolved_address, clickhouse_port);
@ -319,7 +320,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Setting
Cluster::Cluster(const Settings & settings, const std::vector<std::vector<String>> & names,
const String & username, const String & password, UInt16 clickhouse_port, bool treat_local_as_remote)
const String & username, const String & password, UInt16 clickhouse_port, bool treat_local_as_remote, bool secure)
{
UInt32 current_shard_num = 1;
@ -327,7 +328,7 @@ Cluster::Cluster(const Settings & settings, const std::vector<std::vector<String
{
Addresses current;
for (auto & replica : shard)
current.emplace_back(replica, username, password, clickhouse_port);
current.emplace_back(replica, username, password, clickhouse_port, secure);
addresses_with_failover.emplace_back(current);

View File

@ -24,7 +24,7 @@ public:
/// This parameter is needed only to check that some address is local (points to ourself).
Cluster(const Settings & settings, const std::vector<std::vector<String>> & names,
const String & username, const String & password,
UInt16 clickhouse_port, bool treat_local_as_remote);
UInt16 clickhouse_port, bool treat_local_as_remote, bool secure = false);
Cluster(const Cluster &) = delete;
Cluster & operator=(const Cluster &) = delete;
@ -69,7 +69,7 @@ public:
Address() = default;
Address(const Poco::Util::AbstractConfiguration & config, const String & config_prefix);
Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port);
Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port, bool secure_ = false);
/// Returns 'escaped_host_name:port'
String toString() const;

View File

@ -13,7 +13,7 @@ namespace DB
/// Checks that ast is ASTIdentifier and removes num_qualifiers_to_strip components from the left.
/// Example: 'database.table.name' -> (num_qualifiers_to_strip = 2) -> 'name'.
void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip)
void stripIdentifier(const DB::ASTPtr & ast, size_t num_qualifiers_to_strip)
{
ASTIdentifier * identifier = typeid_cast<ASTIdentifier *>(ast.get());
@ -22,29 +22,15 @@ void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip)
if (num_qualifiers_to_strip)
{
size_t num_components = identifier->children.size();
/// plain column
if (num_components - num_qualifiers_to_strip == 1)
identifier->name_parts.erase(identifier->name_parts.begin(), identifier->name_parts.begin() + num_qualifiers_to_strip);
DB::String new_name;
for (const auto & part : identifier->name_parts)
{
DB::String node_alias = identifier->tryGetAlias();
ast = identifier->children.back();
if (!node_alias.empty())
ast->setAlias(node_alias);
}
else
/// nested column
{
identifier->children.erase(identifier->children.begin(), identifier->children.begin() + num_qualifiers_to_strip);
DB::String new_name;
for (const auto & child : identifier->children)
{
if (!new_name.empty())
new_name += '.';
new_name += static_cast<const ASTIdentifier &>(*child.get()).name;
}
identifier->name = new_name;
if (!new_name.empty())
new_name += '.';
new_name += part;
}
identifier->name.swap(new_name);
}
}
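Editor's sketch (not part of this diff): the rewritten stripIdentifier erases the leading qualifiers from name_parts and re-joins the rest with '.'. The same steps over plain containers:
#include <cassert>
#include <string>
#include <vector>
int main()
{
    std::vector<std::string> name_parts = {"db", "table", "nested", "column"};
    const size_t num_qualifiers_to_strip = 2; /// drop the database.table prefix
    name_parts.erase(name_parts.begin(), name_parts.begin() + num_qualifiers_to_strip);
    std::string new_name;
    for (const auto & part : name_parts)
    {
        if (!new_name.empty())
            new_name += '.';
        new_name += part;
    }
    assert(new_name == "nested.column");
}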
@ -52,32 +38,16 @@ void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip)
size_t getNumComponentsToStripInOrderToTranslateQualifiedName(const ASTIdentifier & identifier,
const DatabaseAndTableWithAlias & names)
{
size_t num_qualifiers_to_strip = 0;
/// database.table.column
if (doesIdentifierBelongTo(identifier, names.database, names.table))
return 2;
/// It is compound identifier
if (!identifier.children.empty())
{
size_t num_components = identifier.children.size();
/// table.column or alias.column.
if (doesIdentifierBelongTo(identifier, names.table) ||
doesIdentifierBelongTo(identifier, names.alias))
return 1;
/// database.table.column
if (num_components >= 3
&& !names.database.empty()
&& *getIdentifierName(identifier.children[0]) == names.database
&& *getIdentifierName(identifier.children[1]) == names.table)
{
num_qualifiers_to_strip = 2;
}
/// table.column or alias.column. If num_components > 2, it is like table.nested.column.
if (num_components >= 2
&& ((!names.table.empty() && *getIdentifierName(identifier.children[0]) == names.table)
|| (!names.alias.empty() && *getIdentifierName(identifier.children[0]) == names.alias)))
{
num_qualifiers_to_strip = 1;
}
}
return num_qualifiers_to_strip;
return 0;
}
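Editor's sketch (not part of this diff): the simplified rule above amounts to "strip 2 for database.table.column, 1 for table.column or alias.column, 0 otherwise". A hedged standalone restatement; the helper name is hypothetical and doesIdentifierBelongTo is approximated by direct part comparison:
#include <cassert>
#include <string>
#include <vector>
size_t componentsToStrip(const std::vector<std::string> & parts,
                         const std::string & database, const std::string & table, const std::string & alias)
{
    if (parts.size() >= 3 && !database.empty() && parts[0] == database && parts[1] == table)
        return 2;
    if (parts.size() >= 2 && ((!table.empty() && parts[0] == table) || (!alias.empty() && parts[0] == alias)))
        return 1;
    return 0;
}
int main()
{
    assert(componentsToStrip({"db", "t", "c"}, "db", "t", "") == 2);
    assert(componentsToStrip({"t", "c"}, "db", "t", "") == 1);
    assert(componentsToStrip({"c"}, "db", "t", "") == 0);
}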
@ -87,13 +57,13 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTIdentifier & ident
table = identifier.name;
alias = identifier.tryGetAlias();
if (!identifier.children.empty())
if (!identifier.name_parts.empty())
{
if (identifier.children.size() != 2)
throw Exception("Logical error: number of components in table expression not equal to two", ErrorCodes::LOGICAL_ERROR);
if (identifier.name_parts.size() != 2)
throw Exception("Logical error: 2 components expected in table expression '" + identifier.name + "'", ErrorCodes::LOGICAL_ERROR);
getIdentifierName(identifier.children[0], database);
getIdentifierName(identifier.children[1], table);
database = identifier.name_parts[0];
table = identifier.name_parts[1];
}
}
@ -118,6 +88,22 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableExpression &
throw Exception("Logical error: no known elements in ASTTableExpression", ErrorCodes::LOGICAL_ERROR);
}
bool DatabaseAndTableWithAlias::satisfies(const DatabaseAndTableWithAlias & db_table, bool table_may_be_an_alias)
{
/// table.*, alias.* or database.table.*
if (database.empty())
{
if (!db_table.table.empty() && table == db_table.table)
return true;
if (!db_table.alias.empty())
return (alias == db_table.alias) || (table_may_be_an_alias && table == db_table.alias);
}
return database == db_table.database && table == db_table.table;
}
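Editor's sketch (not part of this diff): a minimal mirror of the satisfies logic above, to make the asymmetry concrete. The struct and free function are illustrative stand-ins for DatabaseAndTableWithAlias:
#include <cassert>
#include <string>
struct DbTable { std::string database, table, alias; };
bool satisfies(const DbTable & self, const DbTable & db_table, bool table_may_be_an_alias)
{
    if (self.database.empty())
    {
        if (!db_table.table.empty() && self.table == db_table.table)
            return true;
        if (!db_table.alias.empty())
            return self.alias == db_table.alias
                || (table_may_be_an_alias && self.table == db_table.alias);
    }
    return self.database == db_table.database && self.table == db_table.table;
}
int main()
{
    DbTable asterisk{"", "t1", ""};  /// "t1.*" in a query
    DbTable table{"", "hits", "t1"}; /// FROM hits AS t1
    assert(satisfies(asterisk, table, true)); /// matches via the alias
    assert(!satisfies(table, asterisk, true)); /// note: not symmetric
}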
String DatabaseAndTableWithAlias::getQualifiedNamePrefix() const
{
if (alias.empty() && table.empty())
@ -133,17 +119,7 @@ void DatabaseAndTableWithAlias::makeQualifiedName(const ASTPtr & ast) const
String prefix = getQualifiedNamePrefix();
identifier->name.insert(identifier->name.begin(), prefix.begin(), prefix.end());
Names qualifiers;
if (!alias.empty())
qualifiers.push_back(alias);
else
{
qualifiers.push_back(database);
qualifiers.push_back(table);
}
for (const auto & qualifier : qualifiers)
identifier->children.emplace_back(std::make_shared<ASTIdentifier>(qualifier));
addIdentifierQualifier(*identifier, database, table, alias);
}
}
@ -209,21 +185,13 @@ std::optional<DatabaseAndTableWithAlias> getDatabaseAndTable(const ASTSelectQuer
return DatabaseAndTableWithAlias(database_and_table_name);
}
ASTPtr getTableFunctionOrSubquery(const ASTSelectQuery & select, size_t table_number)
ASTPtr extractTableExpression(const ASTSelectQuery & select, size_t table_number)
{
const ASTTableExpression * table_expression = getTableExpression(select, table_number);
if (table_expression)
if (const ASTTableExpression * table_expression = getTableExpression(select, table_number))
{
#if 1 /// TODO: It hides some logical error in InterpreterSelectQuery & distributed tables
if (table_expression->database_and_table_name)
{
if (table_expression->database_and_table_name->children.empty())
return table_expression->database_and_table_name;
return table_expression->database_and_table_name;
if (table_expression->database_and_table_name->children.size() == 2)
return table_expression->database_and_table_name->children[1];
}
#endif
if (table_expression->table_function)
return table_expression->table_function;

View File

@ -2,8 +2,9 @@
#include <memory>
#include <optional>
#include <Core/Types.h>
#include <Parsers/ASTSelectQuery.h>
namespace DB
{
@ -33,9 +34,12 @@ struct DatabaseAndTableWithAlias
/// If ast is ASTIdentifier, prepend getQualifiedNamePrefix() to its name.
void makeQualifiedName(const ASTPtr & ast) const;
/// Check if it satisfies another db_table name. @note the operation is not symmetric.
bool satisfies(const DatabaseAndTableWithAlias & table, bool table_may_be_an_alias);
};
void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip);
void stripIdentifier(const DB::ASTPtr & ast, size_t num_qualifiers_to_strip);
size_t getNumComponentsToStripInOrderToTranslateQualifiedName(const ASTIdentifier & identifier,
const DatabaseAndTableWithAlias & names);
@ -44,6 +48,6 @@ std::vector<DatabaseAndTableWithAlias> getDatabaseAndTables(const ASTSelectQuery
std::optional<DatabaseAndTableWithAlias> getDatabaseAndTable(const ASTSelectQuery & select, size_t table_number);
std::vector<const ASTTableExpression *> getSelectTablesExpression(const ASTSelectQuery & select_query);
ASTPtr getTableFunctionOrSubquery(const ASTSelectQuery & select, size_t table_number);
ASTPtr extractTableExpression(const ASTSelectQuery & select, size_t table_number);
}

View File

@ -1,6 +1,7 @@
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ASTExpressionList.h>

View File

@ -310,7 +310,7 @@ void ExpressionAnalyzer::makeSetsForIndexImpl(const ASTPtr & node, const Block &
if (!prepared_sets.count(arg->range)) /// Not already prepared.
{
if (typeid_cast<ASTSubquery *>(arg.get()) || typeid_cast<ASTIdentifier *>(arg.get()))
if (typeid_cast<ASTSubquery *>(arg.get()) || isIdentifier(arg))
{
if (settings.use_index_for_in_with_subqueries)
tryMakeSetForIndexFromSubquery(arg);

View File

@ -55,7 +55,7 @@ public:
ASTPtr table_name;
ASTPtr subquery_or_table_name;
if (typeid_cast<const ASTIdentifier *>(subquery_or_table_name_or_table_expression.get()))
if (isIdentifier(subquery_or_table_name_or_table_expression))
{
table_name = subquery_or_table_name_or_table_expression;
subquery_or_table_name = table_name;
@ -86,7 +86,7 @@ public:
if (table_name)
{
/// If this is already an external table, you do not need to add anything. Just remember its presence.
if (external_tables.end() != external_tables.find(static_cast<const ASTIdentifier &>(*table_name).name))
if (external_tables.end() != external_tables.find(*getIdentifierName(table_name)))
return;
}
@ -112,7 +112,7 @@ public:
* instead of doing a subquery, you just need to read it.
*/
auto database_and_table_name = createDatabaseAndTableNode("", external_table_name);
auto database_and_table_name = createTableIdentifier("", external_table_name);
if (auto ast_table_expr = typeid_cast<ASTTableExpression *>(subquery_or_table_name_or_table_expression.get()))
{

View File

@ -2,6 +2,7 @@
#include <Interpreters/Context.h>
#include <Interpreters/DatabaseAndTableWithAlias.h>
#include <Storages/StorageDistributed.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ASTFunction.h>
@ -167,8 +168,7 @@ void InJoinSubqueriesPreprocessor::process(ASTSelectQuery * query) const
std::string table;
std::tie(database, table) = getRemoteDatabaseAndTableName(*storage);
/// TODO: find a way to avoid AST node replacing
database_and_table = createDatabaseAndTableNode(database, table);
database_and_table = createTableIdentifier(database, table);
}
else
throw Exception("InJoinSubqueriesPreprocessor: unexpected value of 'distributed_product_mode' setting", ErrorCodes::LOGICAL_ERROR);

View File

@ -195,9 +195,11 @@ static ColumnsDeclarationAndModifiers parseColumns(const ASTExpressionList & col
{
auto & col_decl = typeid_cast<ASTColumnDeclaration &>(*ast);
DataTypePtr column_type = nullptr;
if (col_decl.type)
{
columns.emplace_back(col_decl.name, DataTypeFactory::instance().get(col_decl.type));
column_type = DataTypeFactory::instance().get(col_decl.type);
columns.emplace_back(col_decl.name, column_type);
}
else
/// we're creating a dummy DataTypeUInt8 in order to prevent a null pointer dereference in ExpressionActions
@ -228,7 +230,7 @@ static ColumnsDeclarationAndModifiers parseColumns(const ASTExpressionList & col
if (col_decl.codec)
{
auto codec = CompressionCodecFactory::instance().get(col_decl.codec);
auto codec = CompressionCodecFactory::instance().get(col_decl.codec, column_type);
codecs.emplace(col_decl.name, codec);
}

View File

@ -86,20 +86,17 @@ BlockInputStreamPtr InterpreterDescribeQuery::executeImpl()
String database_name;
String table_name;
auto identifier = table_expression->database_and_table_name;
if (identifier->children.size() > 2)
auto identifier = typeid_cast<const ASTIdentifier *>(table_expression->database_and_table_name.get());
if (identifier->name_parts.size() > 2)
throw Exception("Logical error: more than two components in table expression", ErrorCodes::LOGICAL_ERROR);
if (identifier->children.size() > 1)
if (identifier->name_parts.size() > 1)
{
auto database_ptr = identifier->children[0];
auto table_ptr = identifier->children[1];
getIdentifierName(database_ptr, database_name);
getIdentifierName(table_ptr, table_name);
database_name = identifier->name_parts[0];
table_name = identifier->name_parts[1];
}
else
getIdentifierName(identifier, table_name);
table_name = identifier->name;
table = context.getTable(database_name, table_name);
}

View File

@ -147,14 +147,22 @@ InterpreterSelectQuery::InterpreterSelectQuery(
max_streams = settings.max_threads;
ASTPtr table_expression = getTableFunctionOrSubquery(query, 0);
ASTPtr table_expression = extractTableExpression(query, 0);
bool is_table_func = false;
bool is_subquery = false;
if (table_expression)
{
is_table_func = typeid_cast<const ASTFunction *>(table_expression.get());
is_subquery = typeid_cast<const ASTSelectWithUnionQuery *>(table_expression.get());
}
if (input)
{
/// Read from prepared input.
source_header = input->getHeader();
}
else if (table_expression && typeid_cast<const ASTSelectWithUnionQuery *>(table_expression.get()))
else if (is_subquery)
{
/// Read from subquery.
interpreter_subquery = std::make_unique<InterpreterSelectWithUnionQuery>(
@ -164,7 +172,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
}
else if (!storage)
{
if (table_expression && typeid_cast<const ASTFunction *>(table_expression.get()))
if (is_table_func)
{
/// Read from table function.
storage = context.getQueryContext().executeTableFunction(table_expression);
@ -208,7 +216,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
if (query_analyzer->isRewriteSubqueriesPredicate())
{
/// remake interpreter_subquery when PredicateOptimizer is rewrite subqueries and main table is subquery
if (table_expression && typeid_cast<ASTSelectWithUnionQuery *>(table_expression.get()))
if (is_subquery)
interpreter_subquery = std::make_unique<InterpreterSelectWithUnionQuery>(
table_expression, getSubqueryContext(context), required_columns, QueryProcessingStage::Complete, subquery_depth + 1,
only_analyze);
@ -606,8 +614,13 @@ void InterpreterSelectQuery::executeImpl(Pipeline & pipeline, const BlockInputSt
executeRollupOrCube(pipeline, Modificator::ROLLUP);
else if (query.group_by_with_cube)
executeRollupOrCube(pipeline, Modificator::CUBE);
if ((query.group_by_with_rollup || query.group_by_with_cube) && expressions.has_having)
{
if (query.group_by_with_totals)
throw Exception("WITH TOTALS and WITH ROLLUP or CUBE are not supported together in presence of HAVING", ErrorCodes::NOT_IMPLEMENTED);
executeHaving(pipeline, expressions.before_having);
}
}
else if (expressions.has_having)
executeHaving(pipeline, expressions.before_having);
@ -633,8 +646,13 @@ void InterpreterSelectQuery::executeImpl(Pipeline & pipeline, const BlockInputSt
executeRollupOrCube(pipeline, Modificator::ROLLUP);
else if (query.group_by_with_cube)
executeRollupOrCube(pipeline, Modificator::CUBE);
if (expressions.has_having)
{
if (query.group_by_with_totals)
throw Exception("WITH TOTALS and WITH ROLLUP or CUBE are not supported together in presence of HAVING", ErrorCodes::NOT_IMPLEMENTED);
executeHaving(pipeline, expressions.before_having);
}
}
}
@ -911,7 +929,7 @@ void InterpreterSelectQuery::executeFetchColumns(
/// If we need fewer columns than the subquery has, update the interpreter.
if (required_columns.size() < source_header.columns())
{
ASTPtr subquery = getTableFunctionOrSubquery(query, 0);
ASTPtr subquery = extractTableExpression(query, 0);
if (!subquery)
throw Exception("Subquery expected", ErrorCodes::LOGICAL_ERROR);
@ -1386,7 +1404,7 @@ bool hasWithTotalsInAnySubqueryInFromClause(const ASTSelectQuery & query)
* In other cases, totals will be computed on the initiating server of the query, and it is not necessary to read the data to the end.
*/
if (auto query_table = getTableFunctionOrSubquery(query, 0))
if (auto query_table = extractTableExpression(query, 0))
{
if (auto ast_union = typeid_cast<const ASTSelectWithUnionQuery *>(query_table.get()))
{

View File

@ -1,6 +1,5 @@
#include <Common/typeid_cast.h>
#include <Interpreters/JoinToSubqueryTransformVisitor.h>
#include <Interpreters/SemanticSelectQuery.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ASTIdentifier.h>
@ -19,6 +18,7 @@ namespace ErrorCodes
extern const int TOO_DEEP_AST;
}
#if 0
/// Attach additional semantic info to generated select.
struct AppendSemanticVisitorData
{
@ -35,6 +35,7 @@ struct AppendSemanticVisitorData
done = true;
}
};
#endif
/// Replaces one table element with pair.
struct RewriteTablesVisitorData
@ -124,7 +125,7 @@ void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr & ast
if (!left)
return;
SemanticSelectQuery::hideNames(select, hidden_names, subquery_name);
//SemanticSelectQuery::hideNames(select, hidden_names, subquery_name);
}
select.tables = std::make_shared<ASTTablesInSelectQuery>();
@ -135,11 +136,15 @@ void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr & ast
data.done = true;
}
ASTPtr JoinToSubqueryTransformMatcher::replaceJoin(ASTSelectQuery & select, ASTPtr ast_left, ASTPtr ast_right, const String & subquery_alias)
ASTPtr JoinToSubqueryTransformMatcher::replaceJoin(ASTSelectQuery &, ASTPtr ast_left, ASTPtr ast_right, const String & subquery_alias)
{
#if 0
using RewriteMatcher = LinkedMatcher<
OneTypeMatcher<RewriteTablesVisitorData>,
OneTypeMatcher<AppendSemanticVisitorData>>;
#else
using RewriteMatcher = OneTypeMatcher<RewriteTablesVisitorData>;
#endif
using RewriteVisitor = InDepthNodeVisitor<RewriteMatcher, true>;
auto left = typeid_cast<const ASTTablesInSelectQueryElement *>(ast_left.get());
@ -160,8 +165,12 @@ ASTPtr JoinToSubqueryTransformMatcher::replaceJoin(ASTSelectQuery & select, ASTP
if (!res)
throw Exception("Cannot parse rewrite query", ErrorCodes::LOGICAL_ERROR);
#if 0
RewriteVisitor::Data visitor_data =
std::make_pair<RewriteTablesVisitorData, AppendSemanticVisitorData>({ast_left, ast_right}, {select.semantic});
#else
RewriteVisitor::Data visitor_data{ast_left, ast_right};
#endif
RewriteVisitor(visitor_data).visit(res);
return res;
}

View File

@ -133,8 +133,12 @@ void PredicateExpressionsOptimizer::getDependenciesAndQualifiedOfExpression(cons
{
if (const auto identifier = typeid_cast<ASTIdentifier *>(expression.get()))
{
if (!identifier->children.empty())
dependencies_and_qualified.emplace_back(std::pair(identifier, expression->getAliasOrColumnName()));
String table_alias;
if (!identifier->name_parts.empty())
{
if (!tables_with_aliases.empty())
table_alias = tables_with_aliases[0].getQualifiedNamePrefix();
}
else
{
size_t best_table_pos = 0;
@ -153,9 +157,11 @@ void PredicateExpressionsOptimizer::getDependenciesAndQualifiedOfExpression(cons
}
}
String qualified_name = tables_with_aliases[best_table_pos].getQualifiedNamePrefix() + expression->getAliasOrColumnName();
dependencies_and_qualified.emplace_back(std::pair(identifier, qualified_name));
table_alias = tables_with_aliases[best_table_pos].getQualifiedNamePrefix();
}
String qualified_name = table_alias + expression->getAliasOrColumnName();
dependencies_and_qualified.emplace_back(std::pair(identifier, qualified_name));
}
else
{
@ -356,31 +362,17 @@ ASTs PredicateExpressionsOptimizer::evaluateAsterisk(ASTSelectQuery * select_que
if (qualified_asterisk->children.size() != 1)
throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);
ASTIdentifier * ident = typeid_cast<ASTIdentifier *>(qualified_asterisk->children[0].get());
if (!ident)
throw Exception("Logical error: qualified asterisk must have identifier as its child", ErrorCodes::LOGICAL_ERROR);
DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);
size_t num_components = ident->children.size();
if (num_components > 2)
throw Exception("Qualified asterisk cannot have more than two qualifiers", ErrorCodes::UNKNOWN_ELEMENT_IN_AST);
for (auto it = tables_expression.begin(); it != tables_expression.end(); ++it)
for (auto it = tables_expression.begin(); it != tables_expression.end();)
{
const ASTTableExpression * table_expression = *it;
DatabaseAndTableWithAlias database_and_table_with_alias(*table_expression, context.getCurrentDatabase());
/// database.table.*
if (num_components == 2 && !database_and_table_with_alias.database.empty()
&& static_cast<const ASTIdentifier &>(*ident->children[0]).name == database_and_table_with_alias.database
&& static_cast<const ASTIdentifier &>(*ident->children[1]).name == database_and_table_with_alias.table)
continue;
/// table.* or alias.*
else if (num_components == 0
&& ((!database_and_table_with_alias.table.empty() && ident->name == database_and_table_with_alias.table)
|| (!database_and_table_with_alias.alias.empty() && ident->name == database_and_table_with_alias.alias)))
continue;
if (ident_db_and_name.satisfies(database_and_table_with_alias, true))
++it;
else
/// It's not a required table
tables_expression.erase(it);
it = tables_expression.erase(it); /// It's not a required table
}
}
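The loop above was rewritten to the standard erase-while-iterating idiom: the old code erased through `it` and then incremented the now-invalidated iterator, while the new code advances via the iterator returned by erase(). A minimal standalone sketch of the pattern (plain std::vector, none of the ClickHouse types):

#include <iostream>
#include <vector>

int main()
{
    std::vector<int> values{1, 2, 3, 4, 5, 6};

    for (auto it = values.begin(); it != values.end();)
    {
        if (*it % 2 == 0)
            it = values.erase(it); // not a required element: drop it and continue with the next one
        else
            ++it;
    }

    for (int v : values)
        std::cout << v << ' '; // prints: 1 3 5
    std::cout << '\n';
}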

View File

@ -168,18 +168,11 @@ void QueryNormalizer::visit(ASTExpressionList & node, const ASTPtr &, Data & dat
}
else if (const auto * qualified_asterisk = typeid_cast<const ASTQualifiedAsterisk *>(child.get()))
{
const ASTIdentifier * identifier = typeid_cast<const ASTIdentifier *>(qualified_asterisk->children[0].get());
size_t num_components = identifier->children.size();
DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);
for (const auto & [table_name, table_columns] : tables_with_columns)
{
if ((num_components == 2 /// database.table.*
&& !table_name.database.empty() /// This is normal (not a temporary) table.
&& static_cast<const ASTIdentifier &>(*identifier->children[0]).name == table_name.database
&& static_cast<const ASTIdentifier &>(*identifier->children[1]).name == table_name.table)
|| (num_components == 0 /// t.*
&& ((!table_name.table.empty() && identifier->name == table_name.table) /// table.*
|| (!table_name.alias.empty() && identifier->name == table_name.alias)))) /// alias.*
if (ident_db_and_name.satisfies(table_name, true))
{
for (const auto & column_name : table_columns)
node.children.emplace_back(std::make_shared<ASTIdentifier>(column_name));

View File

@ -1,43 +0,0 @@
#pragma once
#include <Parsers/IAST.h>
#include <Parsers/ASTSelectQuery.h>
namespace DB
{
/// Additional information for ASTSelectQuery
class SemanticSelectQuery : public ISemantic
{
public:
SemanticPtr clone() const override { return std::make_shared<SemanticSelectQuery>(*this); }
std::vector<String> getPossibleNames(const String & name) const
{
std::vector<String> res;
res.push_back(name);
for (auto it = hidings.find(name); it != hidings.end(); it = hidings.find(it->second))
res.push_back(it->second);
return res;
}
static void hideNames(ASTSelectQuery & select, const std::vector<String> & hidden, const String & new_name)
{
if (!select.semantic)
select.semantic = std::make_shared<SemanticSelectQuery>();
auto & sema = static_cast<SemanticSelectQuery &>(*select.semantic);
sema.hideNames(hidden, new_name);
}
private:
std::unordered_map<String, String> hidings;
void hideNames(const std::vector<String> & hidden, const String & new_name)
{
for (auto & name : hidden)
hidings.emplace(name, new_name);
}
};
}

View File

@ -89,38 +89,17 @@ std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(const ASTIdentifier
std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk & , const ASTPtr & ast, Data & data)
{
const std::vector<DatabaseAndTableWithAlias> & tables = data.tables;
if (ast->children.size() != 1)
throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);
ASTIdentifier * ident = typeid_cast<ASTIdentifier *>(ast->children[0].get());
if (!ident)
throw Exception("Logical error: qualified asterisk must have identifier as its child", ErrorCodes::LOGICAL_ERROR);
auto & ident = ast->children[0];
size_t num_components = ident->children.size();
if (num_components > 2)
throw Exception("Qualified asterisk cannot have more than two qualifiers", ErrorCodes::UNKNOWN_ELEMENT_IN_AST);
/// @note it could contain a table alias as the table name.
DatabaseAndTableWithAlias db_and_table(ident);
DatabaseAndTableWithAlias db_and_table(*ident);
for (const auto & table_names : tables)
{
/// database.table.*, table.* or alias.*
if (num_components == 2)
{
if (!table_names.database.empty() &&
db_and_table.database == table_names.database &&
db_and_table.table == table_names.table)
return {};
}
else if (num_components == 0)
{
if ((!table_names.table.empty() && db_and_table.table == table_names.table) ||
(!table_names.alias.empty() && db_and_table.table == table_names.alias))
return {};
}
}
for (const auto & known_table : data.tables)
if (db_and_table.satisfies(known_table, true))
return {};
throw Exception("Unknown qualified identifier: " + ident->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
}

View File

@ -2,6 +2,7 @@
#include <vector>
#include <Core/Names.h>
#include <Interpreters/DatabaseAndTableWithAlias.h>
#include <Interpreters/InDepthNodeVisitor.h>

View File

@ -18,14 +18,14 @@ void ASTIdentifier::formatImplWithoutAlias(const FormatSettings & settings, Form
/// A simple or compound identifier?
if (children.size() > 1)
if (name_parts.size() > 1)
{
for (size_t i = 0, size = children.size(); i < size; ++i)
for (size_t i = 0, size = name_parts.size(); i < size; ++i)
{
if (i != 0)
settings.ostr << '.';
format_element(static_cast<const ASTIdentifier &>(*children[i].get()).name);
format_element(name_parts[i]);
}
}
else
@ -39,6 +39,15 @@ void ASTIdentifier::appendColumnNameImpl(WriteBuffer & ostr) const
writeString(name, ostr);
}
ASTPtr createTableIdentifier(const String & database_name, const String & table_name)
{
if (database_name.empty())
return ASTIdentifier::createSpecial(table_name);
ASTPtr database_and_table = ASTIdentifier::createSpecial(database_name + "." + table_name, {database_name, table_name});
return database_and_table;
}
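To illustrate the name/name_parts pairing that createTableIdentifier relies on above, here is a standalone sketch with a simplified struct (not the real ASTIdentifier):

#include <iostream>
#include <string>
#include <vector>

/// Simplified stand-in for the identifier: the full dotted name plus its components.
struct Identifier
{
    std::string name;                    /// "db.table"
    std::vector<std::string> name_parts; /// {"db", "table"}
};

Identifier makeTableIdentifier(const std::string & database, const std::string & table)
{
    if (database.empty())
        return {table, {}};
    return {database + "." + table, {database, table}};
}

int main()
{
    Identifier id = makeTableIdentifier("db", "hits");
    std::cout << id.name << " has " << id.name_parts.size() << " parts\n"; /// db.hits has 2 parts
}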
bool isIdentifier(const IAST * const ast)
{
if (ast)
@ -67,7 +76,7 @@ bool getIdentifierName(const ASTPtr & ast, String & name)
std::optional<String> getColumnIdentifierName(const ASTIdentifier & node)
{
if (!node.special())
if (!node.special)
return node.name;
return {};
}
@ -76,14 +85,14 @@ std::optional<String> getColumnIdentifierName(const ASTPtr & ast)
{
if (ast)
if (auto id = typeid_cast<const ASTIdentifier *>(ast.get()))
if (!id->special())
if (!id->special)
return id->name;
return {};
}
std::optional<String> getTableIdentifierName(const ASTIdentifier & node)
{
if (node.special())
if (node.special)
return node.name;
return {};
}
@ -92,7 +101,7 @@ std::optional<String> getTableIdentifierName(const ASTPtr & ast)
{
if (ast)
if (auto id = typeid_cast<const ASTIdentifier *>(ast.get()))
if (id->special())
if (id->special)
return id->name;
return {};
}
@ -104,4 +113,35 @@ void setIdentifierSpecial(ASTPtr & ast)
id->setSpecial();
}
void addIdentifierQualifier(ASTIdentifier & identifier, const String & database, const String & table, const String & alias)
{
if (!alias.empty())
{
identifier.name_parts.emplace_back(alias);
}
else
{
if (!database.empty())
identifier.name_parts.emplace_back(database);
identifier.name_parts.emplace_back(table);
}
}
bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & database, const String & table)
{
size_t num_components = identifier.name_parts.size();
if (num_components >= 3)
return identifier.name_parts[0] == database &&
identifier.name_parts[1] == table;
return false;
}
bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & table)
{
size_t num_components = identifier.name_parts.size();
if (num_components >= 2)
return identifier.name_parts[0] == table;
return false;
}
}
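A standalone sketch of the two qualifier checks above, assuming a compound column identifier such as db.t.x is split into name_parts {"db", "t", "x"} (simplified free functions, not the DB:: API):

#include <cassert>
#include <string>
#include <vector>

/// The leading parts of a compound column identifier are its qualifiers;
/// the last part is the column itself.
bool belongsTo(const std::vector<std::string> & parts, const std::string & table)
{
    return parts.size() >= 2 && parts[0] == table;
}

bool belongsTo(const std::vector<std::string> & parts,
               const std::string & database, const std::string & table)
{
    return parts.size() >= 3 && parts[0] == database && parts[1] == table;
}

int main()
{
    std::vector<std::string> column{"db", "t", "x"};
    assert(belongsTo(column, "db", "t")); /// db.t.x belongs to db.t
    assert(!belongsTo(column, "t"));      /// its first qualifier is "db", not "t"
}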

View File

@ -11,18 +11,19 @@ namespace DB
/// Identifier (column, table or alias)
class ASTIdentifier : public ASTWithAlias
{
enum Kind /// TODO This is semantic, not syntax. Remove it.
{
General,
Special, // Database, Table, Format
};
public:
/// name. The composite identifier here will have a concatenated name (of the form a.b.c), and individual components will be available inside the children.
/// The composite identifier will have a concatenated name (of the form a.b.c),
/// and individual components will be available inside the name_parts.
String name;
std::vector<String> name_parts;
ASTIdentifier(const String & name_, const Kind kind_ = General)
: name(name_), kind(kind_) { range = StringRange(name.data(), name.data() + name.size()); }
ASTIdentifier(const String & name_, std::vector<String> && name_parts_ = {})
: name(name_)
, name_parts(name_parts_)
, special(false)
{
range = StringRange(name.data(), name.data() + name.size());
}
/** Get the text that identifies this element. */
String getID(char delim) const override { return "Identifier" + (delim + name); }
@ -34,31 +35,37 @@ public:
set.insert(name);
}
static std::shared_ptr<ASTIdentifier> createSpecial(const String & name_)
{
return std::make_shared<ASTIdentifier>(name_, ASTIdentifier::Special);
}
protected:
void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
void appendColumnNameImpl(WriteBuffer & ostr) const override;
private:
Kind kind;
using ASTWithAlias::children; /// ASTIdentifier is child free
void setSpecial() { kind = Special; }
bool special() const { return kind == Special; }
bool special; /// TODO: it would be ptr to semantic here
static std::shared_ptr<ASTIdentifier> createSpecial(const String & name, std::vector<String> && name_parts = {})
{
auto ret = std::make_shared<ASTIdentifier>(name, std::move(name_parts));
ret->special = true;
return ret;
}
void setSpecial() { special = true; }
friend void setIdentifierSpecial(ASTPtr &);
friend std::optional<String> getColumnIdentifierName(const ASTIdentifier & node);
friend std::optional<String> getColumnIdentifierName(const ASTPtr & ast);
friend std::optional<String> getTableIdentifierName(const ASTIdentifier & node);
friend std::optional<String> getTableIdentifierName(const ASTPtr & ast);
friend ASTPtr createTableIdentifier(const String & database_name, const String & table_name);
};
/// ASTIdentifier Helpers: hide casts and semantic.
ASTPtr createTableIdentifier(const String & database_name, const String & table_name);
bool isIdentifier(const IAST * const ast);
inline bool isIdentifier(const ASTPtr & ast) { return isIdentifier(ast.get()); }
@ -75,5 +82,8 @@ std::optional<String> getTableIdentifierName(const ASTIdentifier & node);
std::optional<String> getTableIdentifierName(const ASTPtr & ast);
void setIdentifierSpecial(ASTPtr & ast);
void addIdentifierQualifier(ASTIdentifier & identifier, const String & database, const String & table, const String & alias);
bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & table_or_alias);
bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & database, const String & table);
}

View File

@ -17,19 +17,6 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
}
ASTPtr createDatabaseAndTableNode(const String & database_name, const String & table_name)
{
if (database_name.empty())
return ASTIdentifier::createSpecial(table_name);
ASTPtr database = ASTIdentifier::createSpecial(database_name);
ASTPtr table = ASTIdentifier::createSpecial(table_name);
ASTPtr database_and_table = ASTIdentifier::createSpecial(database_name + "." + table_name);
database_and_table->children = {database, table};
return database_and_table;
}
ASTPtr ASTSelectQuery::clone() const
{
@ -64,8 +51,6 @@ ASTPtr ASTSelectQuery::clone() const
#undef CLONE
if (semantic)
res->semantic = semantic->clone();
return res;
}
@ -338,7 +323,7 @@ void ASTSelectQuery::replaceDatabaseAndTable(const String & database_name, const
}
String table_alias = getTableExpressionAlias(table_expression);
table_expression->database_and_table_name = createDatabaseAndTableNode(database_name, table_name);
table_expression->database_and_table_name = createTableIdentifier(database_name, table_name);
if (!table_alias.empty())
table_expression->database_and_table_name->setAlias(table_alias);

View File

@ -52,7 +52,4 @@ protected:
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
};
ASTPtr createDatabaseAndTableNode(const String & database_name, const String & table_name);
}

View File

@ -169,19 +169,19 @@ bool ParserCompoundIdentifier::parseImpl(Pos & pos, ASTPtr & node, Expected & ex
return false;
String name;
std::vector<String> parts;
const ASTExpressionList & list = static_cast<const ASTExpressionList &>(*id_list.get());
for (const auto & child : list.children)
{
if (!name.empty())
name += '.';
name += *getIdentifierName(child);
parts.emplace_back(*getIdentifierName(child));
name += parts.back();
}
node = std::make_shared<ASTIdentifier>(name);
/// In `children`, remember the identifier components if there are more than one.
if (list.children.size() > 1)
node->children.insert(node->children.end(), list.children.begin(), list.children.end());
if (parts.size() == 1)
parts.clear();
node = std::make_shared<ASTIdentifier>(name, std::move(parts));
return true;
}
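The joining logic above, sketched standalone: the dotted name is accumulated part by part, and a single-part identifier ends up with empty name_parts.

#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> components{"db", "table", "column"};

    std::string name;
    std::vector<std::string> parts;
    for (const auto & component : components)
    {
        if (!name.empty())
            name += '.';
        parts.emplace_back(component);
        name += parts.back();
    }

    /// A simple identifier needs no components; only compound ones keep them.
    if (parts.size() == 1)
        parts.clear();

    std::cout << name << " (" << parts.size() << " parts)\n"; /// db.table.column (3 parts)
}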

View File

@ -31,20 +31,6 @@ class IAST;
using ASTPtr = std::shared_ptr<IAST>;
using ASTs = std::vector<ASTPtr>;
class ISemantic;
using SemanticPtr = std::shared_ptr<ISemantic>;
/// Interface for attaching additional information to IAST. Derived classes should be named according to their AST nodes' types:
/// ASTIdentifier => SemanticIdentifier, ASTSome => SemanticSome, ...
class ISemantic
{
public:
virtual ~ISemantic() = default;
ISemantic() = default;
ISemantic(const ISemantic &) = default;
virtual SemanticPtr clone() const = 0;
};
class WriteBuffer;
@ -58,7 +44,6 @@ public:
/// This pointer does not allow it to be deleted while the range refers to it.
StringPtr owned_string;
SemanticPtr semantic;
virtual ~IAST() = default;
IAST() = default;

View File

@ -14,6 +14,7 @@
#include <Parsers/ASTAlterQuery.h>
#include <Parsers/ASTColumnDeclaration.h>
#include <Common/typeid_cast.h>
#include <Compression/CompressionFactory.h>
namespace DB
@ -30,6 +31,7 @@ namespace ErrorCodes
std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_ast)
{
const DataTypeFactory & data_type_factory = DataTypeFactory::instance();
const CompressionCodecFactory & compression_codec_factory = CompressionCodecFactory::instance();
if (command_ast->type == ASTAlterCommand::ADD_COLUMN)
{
@ -49,6 +51,9 @@ std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_
command.default_expression = ast_col_decl.default_expression;
}
if (ast_col_decl.codec)
command.codec = compression_codec_factory.get(ast_col_decl.codec);
if (command_ast->column)
command.after_column = *getIdentifierName(command_ast->column);
@ -86,6 +91,9 @@ std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_
command.default_expression = ast_col_decl.default_expression;
}
if (ast_col_decl.codec)
command.codec = compression_codec_factory.get(ast_col_decl.codec);
if (ast_col_decl.comment)
{
const auto & ast_comment = typeid_cast<ASTLiteral &>(*ast_col_decl.comment);
@ -168,6 +176,9 @@ void AlterCommand::apply(ColumnsDescription & columns_description, ASTPtr & orde
if (default_expression)
columns_description.defaults.emplace(column_name, ColumnDefault{default_kind, default_expression});
if (codec)
columns_description.codecs.emplace(column_name, codec);
/// Slow, because each time a list is copied
columns_description.ordinary = Nested::flatten(columns_description.ordinary);
}
@ -200,6 +211,9 @@ void AlterCommand::apply(ColumnsDescription & columns_description, ASTPtr & orde
}
else if (type == MODIFY_COLUMN)
{
if (codec)
columns_description.codecs[column_name] = codec;
if (!is_mutable())
{
auto & comments = columns_description.comments;

View File

@ -55,6 +55,9 @@ struct AlterCommand
/// indicates that this command should not be applied, for example in case of if_exists=true and column doesn't exist.
bool ignore = false;
/// For ADD and MODIFY
CompressionCodecPtr codec;
AlterCommand() = default;
AlterCommand(const Type type, const String & column_name, const DataTypePtr & data_type,
const ColumnDefaultKind default_kind, const ASTPtr & default_expression,

View File

@ -219,6 +219,8 @@ public:
/// If commit() was not called, deletes temporary files, canceling the ALTER.
~AlterDataPartTransaction();
const String & getPartName() const { return data_part->name; }
/// Review the changes before the commit.
const NamesAndTypesList & getNewColumns() const { return new_columns; }
const DataPart::Checksums & getNewChecksums() const { return new_checksums; }

View File

@ -312,7 +312,11 @@ String MergeTreeDataPartChecksums::getTotalChecksumHex() const
void MinimalisticDataPartChecksums::serialize(WriteBuffer & to) const
{
writeString("checksums format version: 5\n", to);
serializeWithoutHeader(to);
}
void MinimalisticDataPartChecksums::serializeWithoutHeader(WriteBuffer & to) const
{
writeVarUInt(num_compressed_files, to);
writeVarUInt(num_uncompressed_files, to);
@ -337,26 +341,31 @@ bool MinimalisticDataPartChecksums::deserialize(ReadBuffer & in)
if (format_version < MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS)
{
auto full_checksums_ptr = std::make_unique<MergeTreeDataPartChecksums>();
if (!full_checksums_ptr->read(in, format_version))
MergeTreeDataPartChecksums new_full_checksums;
if (!new_full_checksums.read(in, format_version))
return false;
computeTotalChecksums(*full_checksums_ptr);
full_checksums = std::move(full_checksums_ptr);
computeTotalChecksums(new_full_checksums);
full_checksums = std::move(new_full_checksums);
return true;
}
if (format_version > MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS)
throw Exception("Unknown checksums format version: " + DB::toString(format_version), ErrorCodes::UNKNOWN_FORMAT);
deserializeWithoutHeader(in);
return true;
}
void MinimalisticDataPartChecksums::deserializeWithoutHeader(ReadBuffer & in)
{
readVarUInt(num_compressed_files, in);
readVarUInt(num_uncompressed_files, in);
readPODBinary(hash_of_all_files, in);
readPODBinary(hash_of_uncompressed_files, in);
readPODBinary(uncompressed_hash_of_compressed_files, in);
return true;
}
void MinimalisticDataPartChecksums::computeTotalChecksums(const MergeTreeDataPartChecksums & full_checksums_)
@ -410,7 +419,7 @@ String MinimalisticDataPartChecksums::getSerializedString(const MergeTreeDataPar
return checksums.getSerializedString();
}
void MinimalisticDataPartChecksums::checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files)
void MinimalisticDataPartChecksums::checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const
{
if (full_checksums && rhs.full_checksums)
full_checksums->checkEqual(*rhs.full_checksums, check_uncompressed_hash_in_compressed_files);
@ -419,7 +428,7 @@ void MinimalisticDataPartChecksums::checkEqual(const MinimalisticDataPartChecksu
checkEqualImpl(rhs, check_uncompressed_hash_in_compressed_files);
}
void MinimalisticDataPartChecksums::checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files)
void MinimalisticDataPartChecksums::checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const
{
if (full_checksums)
full_checksums->checkEqual(rhs, check_uncompressed_hash_in_compressed_files);
@ -430,7 +439,7 @@ void MinimalisticDataPartChecksums::checkEqual(const MergeTreeDataPartChecksums
checkEqualImpl(rhs_minimalistic, check_uncompressed_hash_in_compressed_files);
}
void MinimalisticDataPartChecksums::checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files)
void MinimalisticDataPartChecksums::checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const
{
if (num_compressed_files != rhs.num_compressed_files || num_uncompressed_files != rhs.num_uncompressed_files)
{

View File

@ -4,7 +4,7 @@
#include <IO/WriteBuffer.h>
#include <city.h>
#include <map>
#include <memory>
#include <optional>
class SipHash;
@ -112,7 +112,7 @@ struct MinimalisticDataPartChecksums
}
/// Is set only for old formats
std::unique_ptr<MergeTreeDataPartChecksums> full_checksums;
std::optional<MergeTreeDataPartChecksums> full_checksums;
static constexpr size_t MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS = 5;
@ -120,15 +120,17 @@ struct MinimalisticDataPartChecksums
void computeTotalChecksums(const MergeTreeDataPartChecksums & full_checksums);
bool deserialize(ReadBuffer & in);
void deserializeWithoutHeader(ReadBuffer & in);
static MinimalisticDataPartChecksums deserializeFrom(const String & s);
void serialize(WriteBuffer & to) const;
void serializeWithoutHeader(WriteBuffer & to) const;
String getSerializedString();
static String getSerializedString(const MergeTreeDataPartChecksums & full_checksums, bool minimalistic);
void checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files);
void checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files);
void checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files);
void checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
void checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
void checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
};

View File

@ -148,6 +148,13 @@ struct MergeTreeSettings
*/ \
M(SettingBool, use_minimalistic_checksums_in_zookeeper, true) \
\
/** Store part header (checksums and columns) in a compact format and a single part znode \
* instead of separate znodes (<part>/columns and <part>/checksums). \
* This can dramatically reduce snapshot size in ZooKeeper. \
* Before enabling, check that all replicas support the new format. \
*/ \
M(SettingBool, use_minimalistic_part_header_in_zookeeper, false) \
\
/** How many records about mutations that are done to keep. \
* If zero, then keep all of them */ \
M(SettingUInt64, finished_mutations_to_keep, 100) \

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/ReplicatedMergeTreeAlterThread.h>
#include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
#include <Storages/ColumnsDescription.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Common/setThreadName.h>
@ -155,32 +156,9 @@ void ReplicatedMergeTreeAlterThread::run()
if (!transaction)
continue;
storage.updatePartHeaderInZooKeeperAndCommit(zookeeper, *transaction);
++changed_parts;
/// Update part metadata in ZooKeeper.
Coordination::Requests ops;
ops.emplace_back(zkutil::makeSetRequest(
storage.replica_path + "/parts/" + part->name + "/columns", transaction->getNewColumns().toString(), -1));
ops.emplace_back(zkutil::makeSetRequest(
storage.replica_path + "/parts/" + part->name + "/checksums",
storage.getChecksumsForZooKeeper(transaction->getNewChecksums()),
-1));
try
{
zookeeper->multi(ops);
}
catch (const Coordination::Exception & e)
{
/// The part does not exist in ZK. We will add it to the queue for verification - maybe the part is superfluous, and it must be removed locally.
if (e.code == Coordination::ZNONODE)
storage.enqueuePartForCheck(part->name);
throw;
}
/// Apply file changes.
transaction->commit();
}
/// Columns sizes could be quietly changed in case of MODIFY/ADD COLUMN

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h>
#include <Storages/MergeTree/checkDataPart.h>
#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Common/setThreadName.h>
@ -204,22 +205,34 @@ void ReplicatedMergeTreePartCheckThread::checkPart(const String & part_name)
auto zookeeper = storage.getZooKeeper();
auto table_lock = storage.lockStructure(false);
auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
part->columns, part->checksums);
String part_path = storage.replica_path + "/parts/" + part_name;
String part_znode;
/// If the part is in ZooKeeper, check its data against its checksums, and compare them with the checksums in ZooKeeper.
if (zookeeper->exists(storage.replica_path + "/parts/" + part_name))
if (zookeeper->tryGet(part_path, part_znode))
{
LOG_WARNING(log, "Checking data of part " << part_name << ".");
try
{
auto zk_checksums = MinimalisticDataPartChecksums::deserializeFrom(
zookeeper->get(storage.replica_path + "/parts/" + part_name + "/checksums"));
zk_checksums.checkEqual(part->checksums, true);
ReplicatedMergeTreePartHeader zk_part_header;
if (!part_znode.empty())
zk_part_header = ReplicatedMergeTreePartHeader::fromString(part_znode);
else
{
String columns_znode = zookeeper->get(part_path + "/columns");
String checksums_znode = zookeeper->get(part_path + "/checksums");
zk_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
columns_znode, checksums_znode);
}
auto zk_columns = NamesAndTypesList::parse(
zookeeper->get(storage.replica_path + "/parts/" + part_name + "/columns"));
if (part->columns != zk_columns)
if (local_part_header.getColumnsHash() != zk_part_header.getColumnsHash())
throw Exception("Columns of local part " + part_name + " are different from ZooKeeper", ErrorCodes::TABLE_DIFFERS_TOO_MUCH);
zk_part_header.getChecksums().checkEqual(local_part_header.getChecksums(), true);
checkDataPart(
storage.data.getFullPath() + part_name,
storage.data.index_granularity,

View File

@ -0,0 +1,66 @@
#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
#include <Core/NamesAndTypes.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <Common/SipHash.h>
#include <Common/StringUtils/StringUtils.h>
namespace DB
{
static std::array<char, 16> getSipHash(const String & str)
{
SipHash hash;
hash.update(str.data(), str.size());
std::array<char, 16> result;
hash.get128(result.data());
return result;
}
ReplicatedMergeTreePartHeader ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
const String & columns_znode, const String & checksums_znode)
{
auto columns_hash = getSipHash(columns_znode);
auto checksums = MinimalisticDataPartChecksums::deserializeFrom(checksums_znode);
return ReplicatedMergeTreePartHeader(std::move(columns_hash), std::move(checksums));
}
ReplicatedMergeTreePartHeader ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
const NamesAndTypesList & columns,
const MergeTreeDataPartChecksums & full_checksums)
{
MinimalisticDataPartChecksums checksums;
checksums.computeTotalChecksums(full_checksums);
return ReplicatedMergeTreePartHeader(getSipHash(columns.toString()), std::move(checksums));
}
void ReplicatedMergeTreePartHeader::read(ReadBuffer & in)
{
in >> "part header format version: 1\n";
in.readStrict(columns_hash.data(), columns_hash.size());
checksums.deserializeWithoutHeader(in);
}
ReplicatedMergeTreePartHeader ReplicatedMergeTreePartHeader::fromString(const String & str)
{
ReadBufferFromString in(str);
ReplicatedMergeTreePartHeader result;
result.read(in);
return result;
}
void ReplicatedMergeTreePartHeader::write(WriteBuffer & out) const
{
writeString("part header format version: 1\n", out);
out.write(columns_hash.data(), columns_hash.size());
checksums.serializeWithoutHeader(out);
}
String ReplicatedMergeTreePartHeader::toString() const
{
WriteBufferFromOwnString out;
write(out);
return out.str();
}
}

View File

@ -0,0 +1,50 @@
#pragma once
#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
#include <Core/Types.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/Operators.h>
#include <array>
namespace DB
{
class NamesAndTypesList;
/// This class provides a compact representation of part metadata (available columns and checksums)
/// that is intended to be stored in the part znode in ZooKeeper.
/// It can also be initialized from the legacy format (from the contents of separate <part>/columns
/// and <part>/checksums znodes).
class ReplicatedMergeTreePartHeader
{
public:
ReplicatedMergeTreePartHeader() = default;
static ReplicatedMergeTreePartHeader fromColumnsAndChecksumsZNodes(
const String & columns_znode, const String & checksums_znode);
static ReplicatedMergeTreePartHeader fromColumnsAndChecksums(
const NamesAndTypesList & columns, const MergeTreeDataPartChecksums & full_checksums);
void read(ReadBuffer & in);
static ReplicatedMergeTreePartHeader fromString(const String & str);
void write(WriteBuffer & out) const;
String toString() const;
const std::array<char, 16> & getColumnsHash() const { return columns_hash; }
const MinimalisticDataPartChecksums & getChecksums() const { return checksums; }
private:
ReplicatedMergeTreePartHeader(std::array<char, 16> columns_hash_, MinimalisticDataPartChecksums checksums_)
: columns_hash(std::move(columns_hash_)), checksums(std::move(checksums_))
{
}
std::array<char, 16> columns_hash;
MinimalisticDataPartChecksums checksums;
};
}
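As a rough illustration of the serialization format described above (a version line, a raw 16-byte hash, then the checksums body), here is a self-contained round-trip sketch with placeholder types; the real class streams MinimalisticDataPartChecksums rather than a plain string:

#include <array>
#include <cassert>
#include <sstream>
#include <string>

/// Toy stand-in for the part header: version line + raw columns hash + body.
struct PartHeaderSketch
{
    std::array<char, 16> columns_hash{};
    std::string checksums_body; /// stands in for the minimalistic checksums

    std::string toString() const
    {
        std::ostringstream out;
        out << "part header format version: 1\n";
        out.write(columns_hash.data(), columns_hash.size());
        out << checksums_body;
        return out.str();
    }

    static PartHeaderSketch fromString(const std::string & str)
    {
        std::istringstream in(str);
        std::string version_line;
        std::getline(in, version_line);
        assert(version_line == "part header format version: 1");

        PartHeaderSketch result;
        in.read(result.columns_hash.data(), result.columns_hash.size());
        std::getline(in, result.checksums_body); /// placeholder body on one line
        return result;
    }
};

int main()
{
    PartHeaderSketch header;
    header.columns_hash.fill('x');
    header.checksums_body = "fake-checksums";

    PartHeaderSketch restored = PartHeaderSketch::fromString(header.toString());
    assert(restored.columns_hash == header.columns_hash);
    assert(restored.checksums_body == header.checksums_body);
}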

View File

@ -30,7 +30,7 @@ namespace ErrorCodes
static void extractDependentTable(ASTSelectQuery & query, String & select_database_name, String & select_table_name)
{
auto db_and_table = getDatabaseAndTable(query, 0);
ASTPtr subquery = getTableFunctionOrSubquery(query, 0);
ASTPtr subquery = extractTableExpression(query, 0);
if (!db_and_table && !subquery)
return;
@ -69,7 +69,7 @@ static void checkAllowedQueries(const ASTSelectQuery & query)
if (query.prewhere_expression || query.final() || query.sample_size())
throw Exception("MATERIALIZED VIEW cannot have PREWHERE, SAMPLE or FINAL.", DB::ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW);
ASTPtr subquery = getTableFunctionOrSubquery(query, 0);
ASTPtr subquery = extractTableExpression(query, 0);
if (!subquery)
return;

View File

@ -8,13 +8,14 @@
#include <Storages/ColumnsDescription.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Storages/MergeTree/MergeTreeDataPart.h>
#include <Storages/MergeTree/MergeList.h>
#include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
#include <Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQuorumEntry.h>
#include <Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h>
#include <Storages/MergeTree/MergeList.h>
#include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQuorumAddedParts.h>
#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
#include <Databases/IDatabase.h>
@ -566,10 +567,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
}
}
else
{
LOG_ERROR(log, "Fetching missing part " << missing_name);
parts_to_fetch.push_back(missing_name);
}
}
for (const String & name : parts_to_fetch)
@ -671,25 +669,49 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
removePartsFromZooKeeper(zookeeper, Strings(expected_parts.begin(), expected_parts.end()));
}
/// Add to the queue job to pick up the missing parts from other replicas and remove from ZK the information that we have them.
for (const String & name : parts_to_fetch)
/// Add to the queue jobs to pick up the missing parts from other replicas and remove from ZK the information that we have them.
std::vector<std::future<Coordination::ExistsResponse>> exists_futures;
exists_futures.reserve(parts_to_fetch.size());
for (const String & part_name : parts_to_fetch)
{
LOG_ERROR(log, "Removing missing part from ZooKeeper and queueing a fetch: " << name);
String part_path = replica_path + "/parts/" + part_name;
exists_futures.emplace_back(zookeeper->asyncExists(part_path));
}
std::vector<std::future<Coordination::MultiResponse>> enqueue_futures;
enqueue_futures.reserve(parts_to_fetch.size());
for (size_t i = 0; i < parts_to_fetch.size(); ++i)
{
const String & part_name = parts_to_fetch[i];
LOG_ERROR(log, "Removing locally missing part from ZooKeeper and queueing a fetch: " << part_name);
Coordination::Requests ops;
time_t part_create_time = 0;
Coordination::ExistsResponse exists_resp = exists_futures[i].get();
if (!exists_resp.error)
{
part_create_time = exists_resp.stat.ctime / 1000;
removePartFromZooKeeper(part_name, ops, exists_resp.stat.numChildren > 0);
}
LogEntry log_entry;
log_entry.type = LogEntry::GET_PART;
log_entry.source_replica = "";
log_entry.new_part_name = name;
log_entry.create_time = tryGetPartCreateTime(zookeeper, replica_path, name);
log_entry.new_part_name = part_name;
log_entry.create_time = part_create_time;
/// We assume that this occurs before the queue is loaded (queue.initialize).
Coordination::Requests ops;
removePartFromZooKeeper(name, ops);
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/queue/queue-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
zookeeper->multi(ops);
enqueue_futures.emplace_back(zookeeper->asyncMulti(ops));
}
for (auto & future : enqueue_futures)
future.get();
/// Remove extra local parts.
for (const MergeTreeData::DataPartPtr & part : unexpected_parts)
{
@ -708,18 +730,19 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
check(part->columns);
int expected_columns_version = columns_version;
auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
part->columns, part->checksums);
Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas");
std::shuffle(replicas.begin(), replicas.end(), rng);
String expected_columns_str = part->columns.toString();
bool has_been_alredy_added = false;
bool has_been_already_added = false;
for (const String & replica : replicas)
{
Coordination::Stat stat_before, stat_after;
String current_part_path = zookeeper_path + "/replicas/" + replica + "/parts/" + part_name;
String columns_str;
if (!zookeeper->tryGet(current_part_path + "/columns", columns_str, &stat_before))
String part_zk_str;
if (!zookeeper->tryGet(current_part_path, part_zk_str))
{
if (absent_replicas_paths)
absent_replicas_paths->emplace(current_part_path);
@ -727,30 +750,41 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
continue;
}
if (columns_str != expected_columns_str)
ReplicatedMergeTreePartHeader replica_part_header;
if (!part_zk_str.empty())
replica_part_header = ReplicatedMergeTreePartHeader::fromString(part_zk_str);
else
{
Coordination::Stat columns_stat_before, columns_stat_after;
String columns_str;
String checksums_str;
/// Let's check that the version of the columns node did not change while we were reading the checksums.
/// This ensures that the columns and the checksum refer to the same data.
if (!zookeeper->tryGet(current_part_path + "/columns", columns_str, &columns_stat_before) ||
!zookeeper->tryGet(current_part_path + "/checksums", checksums_str) ||
!zookeeper->exists(current_part_path + "/columns", &columns_stat_after) ||
columns_stat_before.version != columns_stat_after.version)
{
LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica
<< " because part changed while we were reading its checksums");
continue;
}
replica_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
columns_str, checksums_str);
}
if (replica_part_header.getColumnsHash() != local_part_header.getColumnsHash())
{
LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica
<< " because columns are different");
continue;
}
String checksums_str;
/// Let's check that the node's version with the columns did not change while we were reading the checksums.
/// This ensures that the columns and the checksum refer to the same data.
if (!zookeeper->tryGet(current_part_path + "/checksums", checksums_str) ||
!zookeeper->exists(current_part_path + "/columns", &stat_after) ||
stat_before.version != stat_after.version)
{
LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica
<< " because part changed while we were reading its checksums");
continue;
}
auto zk_checksums = MinimalisticDataPartChecksums::deserializeFrom(checksums_str);
zk_checksums.checkEqual(part->checksums, true);
replica_part_header.getChecksums().checkEqual(local_part_header.getChecksums(), true);
if (replica == replica_name)
has_been_alredy_added = true;
has_been_already_added = true;
/// If we verify checksums in "sequential manner" (i.e. recheck absence of checksums on other replicas when commit)
/// then it is enough to verify checksums on at least one replica since checksums on other replicas must be the same.
@ -761,18 +795,27 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
}
}
if (!has_been_alredy_added)
if (!has_been_already_added)
{
String part_path = replica_path + "/parts/" + part_name;
ops.emplace_back(zkutil::makeCheckRequest(
zookeeper_path + "/columns", expected_columns_version));
ops.emplace_back(zkutil::makeCreateRequest(
part_path, "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
part_path + "/columns", part->columns.toString(), zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
part_path + "/checksums", getChecksumsForZooKeeper(part->checksums), zkutil::CreateMode::Persistent));
if (data.settings.use_minimalistic_part_header_in_zookeeper)
{
ops.emplace_back(zkutil::makeCreateRequest(
part_path, local_part_header.toString(), zkutil::CreateMode::Persistent));
}
else
{
ops.emplace_back(zkutil::makeCreateRequest(
part_path, "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
part_path + "/columns", part->columns.toString(), zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
part_path + "/checksums", getChecksumsForZooKeeper(part->checksums), zkutil::CreateMode::Persistent));
}
}
else
{
@ -1510,16 +1553,8 @@ void StorageReplicatedMergeTree::executeClearColumnInPartition(const LogEntry &
if (!transaction)
continue;
/// Update part metadata in ZooKeeper.
Coordination::Requests ops;
ops.emplace_back(zkutil::makeSetRequest(
replica_path + "/parts/" + part->name + "/columns", transaction->getNewColumns().toString(), -1));
ops.emplace_back(zkutil::makeSetRequest(
replica_path + "/parts/" + part->name + "/checksums", getChecksumsForZooKeeper(transaction->getNewChecksums()), -1));
updatePartHeaderInZooKeeperAndCommit(zookeeper, *transaction);
zookeeper->multi(ops);
transaction->commit();
++modified_parts;
}
@ -2322,12 +2357,15 @@ bool StorageReplicatedMergeTree::createLogEntryToMutatePart(const MergeTreeDataP
}
void StorageReplicatedMergeTree::removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops)
void StorageReplicatedMergeTree::removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops, bool has_children)
{
String part_path = replica_path + "/parts/" + part_name;
ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/checksums", -1));
ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/columns", -1));
if (has_children)
{
ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/checksums", -1));
ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/columns", -1));
}
ops.emplace_back(zkutil::makeRemoveRequest(part_path, -1));
}
@ -2338,19 +2376,26 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n
String part_path = replica_path + "/parts/" + part_name;
Coordination::Requests ops;
time_t part_create_time = 0;
Coordination::Stat stat;
if (zookeeper->exists(part_path, &stat))
{
part_create_time = stat.ctime / 1000;
removePartFromZooKeeper(part_name, ops, stat.numChildren > 0);
}
LogEntryPtr log_entry = std::make_shared<LogEntry>();
log_entry->type = LogEntry::GET_PART;
log_entry->create_time = tryGetPartCreateTime(zookeeper, replica_path, part_name);
log_entry->create_time = part_create_time;
log_entry->source_replica = "";
log_entry->new_part_name = part_name;
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/queue/queue-", log_entry->toString(),
zkutil::CreateMode::PersistentSequential));
removePartFromZooKeeper(part_name, ops);
auto results = zookeeper->multi(ops);
String path_created = dynamic_cast<const Coordination::CreateResponse &>(*results[0]).path_created;
@ -2691,8 +2736,18 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Strin
MinimalisticDataPartChecksums source_part_checksums;
source_part_checksums.computeTotalChecksums(source_part->checksums);
String desired_checksums_str = getZooKeeper()->get(source_replica_path + "/parts/" + part_name + "/checksums");
auto desired_checksums = MinimalisticDataPartChecksums::deserializeFrom(desired_checksums_str);
MinimalisticDataPartChecksums desired_checksums;
auto zookeeper = getZooKeeper();
String part_path = source_replica_path + "/parts/" + part_name;
String part_znode = zookeeper->get(part_path);
if (!part_znode.empty())
desired_checksums = ReplicatedMergeTreePartHeader::fromString(part_znode).getChecksums();
else
{
String desired_checksums_str = zookeeper->get(part_path + "/checksums");
desired_checksums = MinimalisticDataPartChecksums::deserializeFrom(desired_checksums_str);
}
if (source_part_checksums == desired_checksums)
{
LOG_TRACE(log, "Found local part " << source_part->name << " with the same checksums as " << part_name);
@ -4450,32 +4505,40 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(MergeTre
bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const Strings & part_names, size_t max_retries)
{
using MultiFuture = std::future<Coordination::MultiResponse>;
size_t num_tries = 0;
bool sucess = false;
bool success = false;
while (!sucess && (max_retries == 0 || num_tries < max_retries))
while (!success && (max_retries == 0 || num_tries < max_retries))
{
std::vector<MultiFuture> futures;
futures.reserve(part_names.size());
++num_tries;
sucess = true;
try
{
++num_tries;
success = true;
auto zookeeper = getZooKeeper();
std::vector<std::future<Coordination::ExistsResponse>> exists_futures;
exists_futures.reserve(part_names.size());
for (const String & part_name : part_names)
{
Coordination::Requests ops;
removePartFromZooKeeper(part_name, ops);
futures.emplace_back(zookeeper->tryAsyncMulti(ops));
String part_path = replica_path + "/parts/" + part_name;
exists_futures.emplace_back(zookeeper->asyncExists(part_path));
}
for (auto & future : futures)
std::vector<std::future<Coordination::MultiResponse>> remove_futures;
remove_futures.reserve(part_names.size());
for (size_t i = 0; i < part_names.size(); ++i)
{
Coordination::ExistsResponse exists_resp = exists_futures[i].get();
if (!exists_resp.error)
{
Coordination::Requests ops;
removePartFromZooKeeper(part_names[i], ops, exists_resp.stat.numChildren > 0);
remove_futures.emplace_back(zookeeper->tryAsyncMulti(ops));
}
}
for (auto & future : remove_futures)
{
auto response = future.get();
@ -4484,7 +4547,7 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const St
if (Coordination::isHardwareError(response.error))
{
sucess = false;
success = false;
continue;
}
@ -4493,7 +4556,7 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const St
}
catch (Coordination::Exception & e)
{
sucess = false;
success = false;
if (Coordination::isHardwareError(e.code))
tryLogCurrentException(log, __PRETTY_FUNCTION__);
@ -4501,70 +4564,79 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const St
throw;
}
if (!sucess && num_tries < max_retries)
if (!success && num_tries < max_retries)
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
}
return sucess;
return success;
}
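The control flow of the retry loop above, reduced to a standalone sketch (the simulated transient failure stands in for ZooKeeper hardware errors; max_retries == 0 means retry indefinitely):

#include <chrono>
#include <iostream>
#include <thread>

/// Keep trying until success, up to max_retries, sleeping between attempts.
bool removeWithRetries(size_t max_retries)
{
    size_t num_tries = 0;
    bool success = false;

    while (!success && (max_retries == 0 || num_tries < max_retries))
    {
        ++num_tries;
        success = true;

        /// Stand-in for the ZooKeeper multi-requests: pretend the first
        /// two attempts hit a transient error.
        if (num_tries < 3)
            success = false;

        if (!success && num_tries < max_retries)
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
    return success;
}

int main()
{
    std::cout << (removeWithRetries(5) ? "removed" : "gave up") << '\n'; /// removed
}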
/// TODO: rewrite this code using async Multi ops after final ZooKeeper library update
void StorageReplicatedMergeTree::removePartsFromZooKeeper(zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names,
NameSet * parts_should_be_retried)
void StorageReplicatedMergeTree::removePartsFromZooKeeper(
zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names, NameSet * parts_should_be_retried)
{
Coordination::Requests ops;
auto it_first_node_in_batch = part_names.cbegin();
for (auto it = part_names.cbegin(); it != part_names.cend(); ++it)
std::vector<std::future<Coordination::ExistsResponse>> exists_futures;
exists_futures.reserve(part_names.size());
for (const String & part_name : part_names)
{
removePartFromZooKeeper(*it, ops);
String part_path = replica_path + "/parts/" + part_name;
exists_futures.emplace_back(zookeeper->asyncExists(part_path));
}
auto it_next = std::next(it);
if (ops.size() >= zkutil::MULTI_BATCH_SIZE || it_next == part_names.cend())
std::vector<std::future<Coordination::MultiResponse>> remove_futures;
remove_futures.reserve(part_names.size());
try
{
for (size_t i = 0; i < part_names.size(); ++i)
{
Coordination::Responses unused_responses;
auto code = zookeeper->tryMultiNoThrow(ops, unused_responses);
ops.clear();
if (code == Coordination::ZNONODE)
Coordination::ExistsResponse exists_resp = exists_futures[i].get();
if (!exists_resp.error)
{
/// Fallback
LOG_DEBUG(log, "ZooKeeper nodes for some parts in the batch are missing, will remove part nodes one by one");
for (auto it_in_batch = it_first_node_in_batch; it_in_batch != it_next; ++it_in_batch)
{
Coordination::Requests cur_ops;
removePartFromZooKeeper(*it_in_batch, cur_ops);
auto cur_code = zookeeper->tryMultiNoThrow(cur_ops, unused_responses);
if (cur_code == Coordination::ZNONODE)
{
LOG_DEBUG(log, "There is no part " << *it_in_batch << " in ZooKeeper, it was only in filesystem");
}
else if (parts_should_be_retried && Coordination::isHardwareError(cur_code))
{
parts_should_be_retried->emplace(*it_in_batch);
}
else if (cur_code)
{
LOG_WARNING(log, "Cannot remove part " << *it_in_batch << " from ZooKeeper: " << zkutil::ZooKeeper::error2string(cur_code));
}
}
Coordination::Requests ops;
removePartFromZooKeeper(part_names[i], ops, exists_resp.stat.numChildren > 0);
remove_futures.emplace_back(zookeeper->tryAsyncMulti(ops));
}
else if (parts_should_be_retried && Coordination::isHardwareError(code))
else
{
for (auto it_in_batch = it_first_node_in_batch; it_in_batch != it_next; ++it_in_batch)
parts_should_be_retried->emplace(*it_in_batch);
LOG_DEBUG(log,
"There is no part " << part_names[i] << " in ZooKeeper, it was only in filesystem");
// Emplace an invalid future so that the total number of futures matches part_names.size().
remove_futures.emplace_back();
}
else if (code)
{
LOG_WARNING(log, "There was a problem with deleting " << (it_next - it_first_node_in_batch)
<< " nodes from ZooKeeper: " << ::zkutil::ZooKeeper::error2string(code));
}
it_first_node_in_batch = it_next;
}
}
catch (const Coordination::Exception & e)
{
if (parts_should_be_retried && Coordination::isHardwareError(e.code))
parts_should_be_retried->insert(part_names.begin(), part_names.end());
throw;
}
for (size_t i = 0; i < remove_futures.size(); ++i)
{
auto & future = remove_futures[i];
if (!future.valid())
continue;
auto response = future.get();
if (response.error == Coordination::ZOK)
continue;
else if (response.error == Coordination::ZNONODE)
{
LOG_DEBUG(log,
"There is no part " << part_names[i] << " in ZooKeeper, it was only in filesystem");
continue;
}
else if (Coordination::isHardwareError(response.error))
{
if (parts_should_be_retried)
parts_should_be_retried->insert(part_names[i]);
continue;
}
else
LOG_WARNING(log, "Cannot remove part " << part_names[i] << " from ZooKeeper: "
<< zkutil::ZooKeeper::error2string(response.error));
}
}
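The function above pipelines two phases of ZooKeeper requests: all "exists" checks are issued asynchronously first, then one removal multi-request per existing node, and only then are the results collected, so round trips overlap instead of costing one full latency per part. A standalone sketch of the same shape with std::async:

#include <future>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> part_names{"part_1", "part_2", "part_3"};

    /// Phase 1: fire all existence checks without waiting.
    std::vector<std::future<bool>> exists_futures;
    exists_futures.reserve(part_names.size());
    for (const auto & name : part_names)
        exists_futures.emplace_back(std::async(std::launch::async,
            [name] { return name != "part_2"; })); /// pretend part_2 is missing

    /// Phase 2: fire one removal per existing node; keep indexes aligned.
    std::vector<std::future<void>> remove_futures;
    remove_futures.reserve(part_names.size());
    for (size_t i = 0; i < part_names.size(); ++i)
    {
        if (exists_futures[i].get())
            remove_futures.emplace_back(std::async(std::launch::async,
                [] { /* issue the removal request here */ }));
        else
            remove_futures.emplace_back(); /// invalid future marks "nothing to do"
    }

    /// Phase 3: collect results.
    for (size_t i = 0; i < remove_futures.size(); ++i)
    {
        if (!remove_futures[i].valid())
        {
            std::cout << part_names[i] << ": nothing to remove\n";
            continue;
        }
        remove_futures[i].get();
        std::cout << part_names[i] << ": removed\n";
    }
}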
@ -4809,18 +4881,81 @@ void StorageReplicatedMergeTree::getCommitPartOps(
ops.emplace_back(zkutil::makeCheckRequest(
zookeeper_path + "/columns",
columns_version));
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/parts/" + part->name,
"",
zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/parts/" + part->name + "/columns",
part->columns.toString(),
zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/parts/" + part->name + "/checksums",
getChecksumsForZooKeeper(part->checksums),
zkutil::CreateMode::Persistent));
if (data.settings.use_minimalistic_part_header_in_zookeeper)
{
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/parts/" + part->name,
ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(part->columns, part->checksums).toString(),
zkutil::CreateMode::Persistent));
}
else
{
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/parts/" + part->name,
"",
zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/parts/" + part->name + "/columns",
part->columns.toString(),
zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(
replica_path + "/parts/" + part->name + "/checksums",
getChecksumsForZooKeeper(part->checksums),
zkutil::CreateMode::Persistent));
}
}
void StorageReplicatedMergeTree::updatePartHeaderInZooKeeperAndCommit(
const zkutil::ZooKeeperPtr & zookeeper,
MergeTreeData::AlterDataPartTransaction & transaction)
{
String part_path = replica_path + "/parts/" + transaction.getPartName();
bool need_delete_columns_and_checksums_nodes = false;
try
{
if (data.settings.use_minimalistic_part_header_in_zookeeper)
{
auto part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
transaction.getNewColumns(), transaction.getNewChecksums());
Coordination::Stat stat;
zookeeper->set(part_path, part_header.toString(), -1, &stat);
need_delete_columns_and_checksums_nodes = stat.numChildren > 0;
}
else
{
Coordination::Requests ops;
ops.emplace_back(zkutil::makeSetRequest(
part_path, String(), -1));
ops.emplace_back(zkutil::makeSetRequest(
part_path + "/columns", transaction.getNewColumns().toString(), -1));
ops.emplace_back(zkutil::makeSetRequest(
part_path + "/checksums", getChecksumsForZooKeeper(transaction.getNewChecksums()), -1));
zookeeper->multi(ops);
}
}
catch (const Coordination::Exception & e)
{
/// The part does not exist in ZK. We will add it to the queue for verification - maybe the part is superfluous, and it must be removed locally.
if (e.code == Coordination::ZNONODE)
enqueuePartForCheck(transaction.getPartName());
throw;
}
/// Apply file changes.
transaction.commit();
/// Legacy <part_path>/columns and <part_path>/checksums znodes are not needed anymore and can be deleted.
if (need_delete_columns_and_checksums_nodes)
{
Coordination::Requests ops;
ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/columns", -1));
ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/checksums", -1));
zookeeper->multi(ops);
}
}
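The ordering in updatePartHeaderInZooKeeperAndCommit matters for crash safety: the compact header is written first, local changes are committed, and only then are the legacy child znodes removed, so a reader never observes a part with neither representation. A toy sketch of that ordering, with a map standing in for ZooKeeper:

#include <iostream>
#include <map>
#include <string>

int main()
{
    std::map<std::string, std::string> znodes{
        {"/parts/p1", ""},
        {"/parts/p1/columns", "old columns"},
        {"/parts/p1/checksums", "old checksums"},
    };

    /// Step 1: write the compact header into the parent znode.
    znodes["/parts/p1"] = "compact header";

    /// Step 2: commit the local (filesystem) part of the transaction here.

    /// Step 3: the legacy children are now redundant and can be removed.
    znodes.erase("/parts/p1/columns");
    znodes.erase("/parts/p1/checksums");

    std::cout << "/parts/p1 = " << znodes["/parts/p1"] << '\n'; /// compact header
}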
ReplicatedMergeTreeAddress StorageReplicatedMergeTree::getReplicatedMergeTreeAddress() const

View File

@ -372,8 +372,14 @@ private:
MergeTreeData::MutableDataPartPtr & part,
const String & block_id_path = "") const;
/// Updates info about part columns and checksums in ZooKeeper and commits the transaction if successful.
void updatePartHeaderInZooKeeperAndCommit(
const zkutil::ZooKeeperPtr & zookeeper,
MergeTreeData::AlterDataPartTransaction & transaction);
/// Adds actions to `ops` that remove a part from ZooKeeper.
void removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops);
/// Set has_children to true for "old-style" parts (those with /columns and /checksums child znodes).
void removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops, bool has_children);
/// Quickly removes big set of parts from ZooKeeper (using async multi queries)
void removePartsFromZooKeeper(zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names,

View File

@ -41,6 +41,7 @@ StorageSystemColumns::StorageSystemColumns(const std::string & name_)
{ "is_in_sorting_key", std::make_shared<DataTypeUInt8>() },
{ "is_in_primary_key", std::make_shared<DataTypeUInt8>() },
{ "is_in_sampling_key", std::make_shared<DataTypeUInt8>() },
{ "compression_codec", std::make_shared<DataTypeString>() },
}));
}
@ -86,6 +87,7 @@ protected:
NamesAndTypesList columns;
ColumnDefaults column_defaults;
ColumnComments column_comments;
ColumnCodecs column_codecs;
Names cols_required_for_partition_key;
Names cols_required_for_sorting_key;
Names cols_required_for_primary_key;
@ -114,6 +116,7 @@ protected:
}
columns = storage->getColumns().getAll();
column_codecs = storage->getColumns().codecs;
column_defaults = storage->getColumns().defaults;
column_comments = storage->getColumns().comments;
@ -219,6 +222,20 @@ protected:
res_columns[res_index++]->insert(find_in_vector(cols_required_for_sampling));
}
{
const auto it = column_codecs.find(column.name);
if (it == std::end(column_codecs))
{
if (columns_mask[src_index++])
res_columns[res_index++]->insertDefault();
}
else
{
if (columns_mask[src_index++])
res_columns[res_index++]->insert("CODEC(" + it->second->getCodecDesc() + ")");
}
}
++rows_count;
}
}

View File

@ -24,3 +24,6 @@ target_link_libraries (get_current_inserts_in_replicated PRIVATE dbms clickhouse
add_executable (get_abandonable_lock_in_all_partitions get_abandonable_lock_in_all_partitions.cpp)
target_link_libraries (get_abandonable_lock_in_all_partitions PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper)
add_executable (transform_part_zk_nodes transform_part_zk_nodes.cpp)
target_link_libraries (transform_part_zk_nodes PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper)

View File

@ -0,0 +1,130 @@
#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <boost/program_options.hpp>
#include <list>
#include <iostream>
int main(int argc, char ** argv)
try
{
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("help,h", "produce help message")
("address,a", boost::program_options::value<std::string>()->required(),
"addresses of ZooKeeper instances, comma separated. Example: example01e.yandex.ru:2181")
("path,p", boost::program_options::value<std::string>()->required(),
"where to start")
;
boost::program_options::variables_map options;
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);
if (options.count("help"))
{
std::cout << "Transform contents of part nodes in ZooKeeper to more compact storage scheme." << std::endl;
std::cout << "Usage: " << argv[0] << " [options]" << std::endl;
std::cout << desc << std::endl;
return 1;
}
zkutil::ZooKeeper zookeeper(options.at("address").as<std::string>());
std::string initial_path = options.at("path").as<std::string>();
struct Node
{
Node(
std::string path_,
std::future<Coordination::GetResponse> get_future_,
std::future<Coordination::ListResponse> children_future_,
Node * parent_)
: path(std::move(path_))
, get_future(std::move(get_future_))
, children_future(std::move(children_future_))
, parent(parent_)
{
}
std::string path;
std::future<Coordination::GetResponse> get_future;
std::future<Coordination::ListResponse> children_future;
Node * parent = nullptr;
std::future<Coordination::MultiResponse> set_future;
};
std::list<Node> nodes_queue;
nodes_queue.emplace_back(
initial_path, zookeeper.asyncGet(initial_path), zookeeper.asyncGetChildren(initial_path), nullptr);
for (auto it = nodes_queue.begin(); it != nodes_queue.end(); ++it)
{
Coordination::GetResponse get_response;
Coordination::ListResponse children_response;
try
{
get_response = it->get_future.get();
children_response = it->children_future.get();
}
catch (const Coordination::Exception & e)
{
if (e.code == Coordination::ZNONODE)
continue;
throw;
}
if (get_response.stat.ephemeralOwner)
continue;
if (it->path.find("/parts/") != std::string::npos
&& !endsWith(it->path, "/columns")
&& !endsWith(it->path, "/checksums"))
{
if (!children_response.names.empty())
{
auto part_header = DB::ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
zookeeper.get(it->path + "/columns"), zookeeper.get(it->path + "/checksums"));
Coordination::Requests ops;
ops.emplace_back(zkutil::makeRemoveRequest(it->path + "/columns", -1));
ops.emplace_back(zkutil::makeRemoveRequest(it->path + "/checksums", -1));
ops.emplace_back(zkutil::makeSetRequest(it->path, part_header.toString(), -1));
it->set_future = zookeeper.asyncMulti(ops);
}
}
else
{
for (const auto & name : children_response.names)
{
std::string child_path = it->path == "/" ? it->path + name : it->path + '/' + name;
nodes_queue.emplace_back(
child_path, zookeeper.asyncGet(child_path), zookeeper.asyncGetChildren(child_path),
&(*it));
}
}
}
for (auto it = nodes_queue.begin(); it != nodes_queue.end(); ++it)
{
if (it->set_future.valid())
{
it->set_future.get();
std::cerr << it->path << " changed!" << std::endl;
}
}
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(true) << '\n';
throw;
}
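The znodes this tool rewrites can also be inspected from SQL via the `system.zookeeper` table; a sketch, where the path is an assumption and must point at a real replicated table:

```sql
-- List part znodes for one replica; the path is illustrative.
SELECT name, dataLength, numChildren
FROM system.zookeeper
WHERE path = '/clickhouse/tables/test/part_header/replicas/1/parts';
```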

View File

@ -12,6 +12,7 @@
#include <Common/typeid_cast.h>
#include <Common/parseRemoteDescription.h>
#include <TableFunctions/TableFunctionFactory.h>
#include <Core/Defines.h>
namespace DB
@ -152,7 +153,8 @@ StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & ast_function, const C
if (names.empty())
throw Exception("Shard list is empty after parsing first argument", ErrorCodes::BAD_ARGUMENTS);
cluster = std::make_shared<Cluster>(context.getSettings(), names, username, password, context.getTCPPort(), false);
auto maybe_secure_port = context.getTCPPortSecure();
cluster = std::make_shared<Cluster>(context.getSettings(), names, username, password, (secure ? (maybe_secure_port ? *maybe_secure_port : DBMS_DEFAULT_SECURE_PORT) : context.getTCPPort()), false, secure);
}
auto structure_remote_table = getStructureOfRemoteTable(*cluster, remote_database, remote_table, context, remote_table_function_ptr);
@ -177,8 +179,8 @@ StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & ast_function, const C
}
TableFunctionRemote::TableFunctionRemote(const std::string & name_)
: name(name_)
TableFunctionRemote::TableFunctionRemote(const std::string & name_, bool secure)
: name{name_}, secure{secure}
{
is_cluster_function = name == "cluster";
@ -193,6 +195,7 @@ TableFunctionRemote::TableFunctionRemote(const std::string & name_)
void registerTableFunctionRemote(TableFunctionFactory & factory)
{
factory.registerFunction("remote", [] () -> TableFunctionPtr { return std::make_shared<TableFunctionRemote>("remote"); });
factory.registerFunction("remoteSecure", [] () -> TableFunctionPtr { return std::make_shared<TableFunctionRemote>("remote", /* secure = */ true); });
factory.registerFunction("cluster", [] () -> TableFunctionPtr { return std::make_shared<TableFunctionRemote>("cluster"); });
}
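With these factory entries in place, the new function is available from SQL; a usage sketch (host and port are illustrative):

```sql
-- Explicit secure port:
SELECT * FROM remoteSecure('127.0.0.1:9440', system.one);

-- Without a port, tcp_port_secure (default 9440) is assumed,
-- matching the maybe_secure_port fallback above:
SELECT * FROM remoteSecure('127.0.0.1', system.one);
```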

View File

@ -16,7 +16,7 @@ namespace DB
class TableFunctionRemote : public ITableFunction
{
public:
explicit TableFunctionRemote(const std::string & name_ = "remote");
explicit TableFunctionRemote(const std::string & name_ = "remote", bool secure = false);
std::string getName() const override { return name; }
@ -26,6 +26,7 @@ private:
std::string name;
bool is_cluster_function;
std::string help_message;
bool secure;
};
}

View File

@ -0,0 +1,46 @@
<test>
<name>Constant column string search</name>
<tags>
<tag>search</tag>
</tags>
<preconditions>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<type>loop</type>
<stop_conditions>
<all_of>
<iterations>5</iterations>
<min_time_not_changing_for_ms>10000</min_time_not_changing_for_ms>
</all_of>
<any_of>
<iterations>50</iterations>
<total_time_ms>60000</total_time_ms>
</any_of>
</stop_conditions>
<query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiPosition(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'yahoo')), sum(match(URL, 'pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|yahoo|pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'facebook')), sum(match(URL, 'wikipedia')), sum(match(URL, 'reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|facebook|wikipedia|reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(firstMatch(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
<main_metric>
<min_time/>
</main_metric>
</test>

View File

@ -0,0 +1,10 @@
0
0
0
0
0
0
0
0
0
0

View File

@ -0,0 +1,24 @@
#!/usr/bin/env bash
# set -x
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
# A non-default server config is needed for this test
tcp_port_secure=`$CLICKHOUSE_EXTRACT_CONFIG -k tcp_port_secure 2>/dev/null`
if [ -z "$tcp_port_secure" ]; then
# Secure port disabled. Fake result
cat $CURDIR/00505_shard_secure.reference
else
$CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure('127.0.0.{1,2}', system.one);"
$CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure('127.0.0.{1,2}:$CLICKHOUSE_PORT_TCP_SECURE', system.one);"
$CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure('127.0.0.{1,2}', system.one);"
$CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure(test_shard_localhost_secure, system.one);"
$CLICKHOUSE_CLIENT -q "SELECT * FROM remote(test_shard_localhost_secure, system.one);"
$CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure(test_shard_localhost, system.one);"
$CLICKHOUSE_CLIENT -q "SELECT * FROM remote(test_shard_localhost, system.one);"
fi

View File

@ -2,11 +2,3 @@ a \N 1
a b 1
a \N 2
a b 1
a \N 1
a b 1
a \N 2
\N \N 2
a b 1
\N \N 1

View File

@ -11,5 +11,4 @@ INSERT INTO test.rollup_having VALUES ('a', 'b');
SELECT a, b, count(*) FROM test.rollup_having GROUP BY a, b WITH ROLLUP HAVING a IS NOT NULL;
SELECT a, b, count(*) FROM test.rollup_having GROUP BY a, b WITH ROLLUP HAVING a IS NOT NULL and b IS NOT NULL;
SELECT a, b, count(*) FROM test.rollup_having GROUP BY a, b WITH ROLLUP WITH TOTALS HAVING a IS NOT NULL;
SELECT a, b, count(*) FROM test.rollup_having GROUP BY a, b WITH ROLLUP WITH TOTALS HAVING a IS NOT NULL and b IS NOT NULL;
DROP TABLE test.rollup_having;

View File

@ -0,0 +1,33 @@
2018-01-01 1
2018-01-01 2
CODEC(ZSTD(1))
2018-01-01 1 default_value
2018-01-01 2 default_value
2018-01-01 3 3
2018-01-01 4 4
CODEC(NONE)
2018-01-01 1 default_value
2018-01-01 2 default_value
2018-01-01 3 3
2018-01-01 4 4
2018-01-01 5 5
2018-01-01 6 6
2018-01-01 1 default_value
2018-01-01 2 default_value
2018-01-01 3 3
2018-01-01 4 4
2018-01-01 5 5
2018-01-01 6 6
CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, NONE)
2018-01-01 1 default_value
2018-01-01 2 default_value
2018-01-01 3 3
2018-01-01 4 4
2018-01-01 5 5
2018-01-01 6 6
2018-01-01 7 7
2018-01-01 8 8
CODEC(ZSTD(1), LZ4HC(0), LZ4, LZ4, NONE)
CODEC(NONE, LZ4, LZ4HC(0), ZSTD(1))
2
1

View File

@ -0,0 +1,88 @@
SET send_logs_level = 'none';
DROP TABLE IF EXISTS test.alter_compression_codec;
CREATE TABLE test.alter_compression_codec (
somedate Date CODEC(LZ4),
id UInt64 CODEC(NONE)
) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id;
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 1);
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 2);
SELECT * FROM test.alter_compression_codec ORDER BY id;
ALTER TABLE test.alter_compression_codec ADD COLUMN alter_column String DEFAULT 'default_value' CODEC(ZSTD);
SELECT compression_codec FROM system.columns WHERE database = 'test' AND table = 'alter_compression_codec' AND name = 'alter_column';
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 3, '3');
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 4, '4');
SELECT * FROM test.alter_compression_codec ORDER BY id;
ALTER TABLE test.alter_compression_codec MODIFY COLUMN alter_column CODEC(NONE);
SELECT compression_codec FROM system.columns WHERE database = 'test' AND table = 'alter_compression_codec' AND name = 'alter_column';
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 5, '5');
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 6, '6');
SELECT * FROM test.alter_compression_codec ORDER BY id;
OPTIMIZE TABLE test.alter_compression_codec FINAL;
SELECT * FROM test.alter_compression_codec ORDER BY id;
ALTER TABLE test.alter_compression_codec MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, NONE);
SELECT compression_codec FROM system.columns WHERE database = 'test' AND table = 'alter_compression_codec' AND name = 'alter_column';
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 7, '7');
INSERT INTO test.alter_compression_codec VALUES('2018-01-01', 8, '8');
OPTIMIZE TABLE test.alter_compression_codec FINAL;
SELECT * FROM test.alter_compression_codec ORDER BY id;
ALTER TABLE test.alter_compression_codec MODIFY COLUMN alter_column FixedString(100);
SELECT compression_codec FROM system.columns WHERE database = 'test' AND table = 'alter_compression_codec' AND name = 'alter_column';
DROP TABLE IF EXISTS test.alter_compression_codec;
DROP TABLE IF EXISTS test.alter_bad_codec;
CREATE TABLE test.alter_bad_codec (
somedate Date CODEC(LZ4),
id UInt64 CODEC(NONE)
) ENGINE = MergeTree() ORDER BY tuple();
ALTER TABLE test.alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(gbdgkjsdh); -- { serverError 432 }
ALTER TABLE test.alter_bad_codec ADD COLUMN alter_column DateTime DEFAULT '2019-01-01 00:00:00' CODEC(ZSTD(100)); -- { serverError 433 }
DROP TABLE IF EXISTS test.alter_bad_codec;
DROP TABLE IF EXISTS test.large_alter_table;
DROP TABLE IF EXISTS test.store_of_hash;
CREATE TABLE test.large_alter_table (
somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12)),
id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC),
data String CODEC(ZSTD(2), LZ4HC, NONE, LZ4, LZ4)
) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2;
INSERT INTO test.large_alter_table SELECT toDate('2019-01-01'), number, toString(number + rand()) FROM system.numbers LIMIT 300000;
CREATE TABLE test.store_of_hash (hash UInt64) ENGINE = Memory();
INSERT INTO test.store_of_hash SELECT sum(cityHash64(*)) FROM test.large_alter_table;
ALTER TABLE test.large_alter_table MODIFY COLUMN data CODEC(NONE, LZ4, LZ4HC, ZSTD);
OPTIMIZE TABLE test.large_alter_table;
SELECT compression_codec FROM system.columns WHERE database = 'test' AND table = 'large_alter_table' AND name = 'data';
DETACH TABLE test.large_alter_table;
ATTACH TABLE test.large_alter_table;
INSERT INTO test.store_of_hash SELECT sum(cityHash64(*)) FROM test.large_alter_table;
SELECT COUNT(hash) FROM test.store_of_hash;
SELECT COUNT(DISTINCT hash) FROM test.store_of_hash;
DROP TABLE IF EXISTS test.large_alter_table;
DROP TABLE IF EXISTS test.store_of_hash;

View File

@ -1,6 +1,6 @@
1 hello 2018-12-14 1.1 aaa
2 world 2018-12-15 2.2 bbb
3 ! 2018-12-16 3.3 ccc
1 hello 2018-12-14 1.1 aaa 5
2 world 2018-12-15 2.2 bbb 6
3 ! 2018-12-16 3.3 ccc 7
2
1 world 2018-10-05 1.1
2 hello 2018-10-01 2.2
@ -9,8 +9,9 @@
10003
274972506.6
9175437371954010821
CREATE TABLE test.compression_codec_multiple_more_types ( id Decimal(38, 13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, LZ4HC), data FixedString(12) CODEC(ZSTD, ZSTD, NONE, NONE, NONE, LZ4HC), `ddd.age` Array(UInt8) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD), `ddd.Name` Array(String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD)) ENGINE = MergeTree() ORDER BY tuple() SETTINGS index_granularity = 8192
CREATE TABLE test.compression_codec_multiple_more_types ( id Decimal(38, 13) CODEC(ZSTD(1), LZ4, ZSTD(1), ZSTD(1), Delta(2), Delta(4), Delta(1), LZ4HC(0)), data FixedString(12) CODEC(ZSTD(1), ZSTD(1), Delta(1), Delta(1), Delta(1), NONE, NONE, NONE, LZ4HC(0)), `ddd.age` Array(UInt8) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8)), `ddd.Name` Array(String) CODEC(LZ4, LZ4HC(0), NONE, NONE, NONE, ZSTD(1), Delta(8))) ENGINE = MergeTree() ORDER BY tuple() SETTINGS index_granularity = 8192
1.5555555555555 hello world! [77] ['John']
7.1000000000000 xxxxxxxxxxxx [127] ['Henry']
!
222
CREATE TABLE test.test_default_delta ( id UInt64 CODEC(Delta(8)), data String CODEC(Delta(1)), somedate Date CODEC(Delta(2)), somenum Float64 CODEC(Delta(8)), somestr FixedString(3) CODEC(Delta(1)), othernum Int64 CODEC(Delta(8)), yetothernum Float32 CODEC(Delta(4)), `ddd.age` Array(UInt8) CODEC(Delta(1)), `ddd.Name` Array(String) CODEC(Delta(1)), `ddd.OName` Array(String) CODEC(Delta(1)), `ddd.BName` Array(String) CODEC(Delta(1))) ENGINE = MergeTree() ORDER BY tuple() SETTINGS index_granularity = 8192

View File

@ -2,17 +2,27 @@ SET send_logs_level = 'none';
DROP TABLE IF EXISTS test.compression_codec;
CREATE TABLE test.compression_codec(id UInt64 CODEC(LZ4), data String CODEC(ZSTD), ddd Date CODEC(NONE), somenum Float64 CODEC(ZSTD(2)), somestr FixedString(3) CODEC(LZ4HC(7))) ENGINE = MergeTree() ORDER BY tuple();
CREATE TABLE test.compression_codec(
id UInt64 CODEC(LZ4),
data String CODEC(ZSTD),
ddd Date CODEC(NONE),
somenum Float64 CODEC(ZSTD(2)),
somestr FixedString(3) CODEC(LZ4HC(7)),
othernum Int64 CODEC(Delta)
) ENGINE = MergeTree() ORDER BY tuple();
INSERT INTO test.compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa');
INSERT INTO test.compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb');
INSERT INTO test.compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc');
INSERT INTO test.compression_codec VALUES(1, 'hello', toDate('2018-12-14'), 1.1, 'aaa', 5);
INSERT INTO test.compression_codec VALUES(2, 'world', toDate('2018-12-15'), 2.2, 'bbb', 6);
INSERT INTO test.compression_codec VALUES(3, '!', toDate('2018-12-16'), 3.3, 'ccc', 7);
SELECT * FROM test.compression_codec ORDER BY id;
OPTIMIZE TABLE test.compression_codec FINAL;
INSERT INTO test.compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd');
INSERT INTO test.compression_codec VALUES(2, '', toDate('2018-12-13'), 4.4, 'ddd', 8);
DETACH TABLE test.compression_codec;
ATTACH TABLE test.compression_codec;
SELECT count(*) FROM test.compression_codec WHERE id = 2 GROUP BY id;
@ -23,28 +33,34 @@ DROP TABLE IF EXISTS test.params_when_no_params;
DROP TABLE IF EXISTS test.too_many_params;
DROP TABLE IF EXISTS test.codec_multiple_direct_specification_1;
DROP TABLE IF EXISTS test.codec_multiple_direct_specification_2;
DROP TABLE IF EXISTS test.delta_bad_params1;
DROP TABLE IF EXISTS test.delta_bad_params2;
CREATE TABLE test.bad_codec(id UInt64 CODEC(adssadads)) ENGINE = MergeTree() order by tuple(); -- { serverError 432 }
CREATE TABLE test.too_many_params(id UInt64 CODEC(ZSTD(2,3,4,5))) ENGINE = MergeTree() order by tuple(); -- { serverError 431 }
CREATE TABLE test.params_when_no_params(id UInt64 CODEC(LZ4(1))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 378 }
CREATE TABLE test.codec_multiple_direct_specification_1(id UInt64 CODEC(MULTIPLE(LZ4, ZSTD))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 }
CREATE TABLE test.codec_multiple_direct_specification_2(id UInt64 CODEC(multiple(LZ4, ZSTD))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 432 }
CREATE TABLE test.delta_bad_params1(id UInt64 CODEC(Delta(3))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 433 }
CREATE TABLE test.delta_bad_params2(id UInt64 CODEC(Delta(16))) ENGINE = MergeTree() ORDER BY tuple(); -- { serverError 433 }
DROP TABLE IF EXISTS test.bad_codec;
DROP TABLE IF EXISTS test.params_when_no_params;
DROP TABLE IF EXISTS test.too_many_params;
DROP TABLE IF EXISTS test.codec_multiple_direct_specification_1;
DROP TABLE IF EXISTS test.codec_multiple_direct_specification_2;
DROP TABLE IF EXISTS test.delta_bad_params1;
DROP TABLE IF EXISTS test.delta_bad_params2;
DROP TABLE IF EXISTS test.compression_codec_multiple;
SET network_compression_method = 'lz4hc';
CREATE TABLE test.compression_codec_multiple (
id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC),
data String CODEC(ZSTD(2), NONE, LZ4HC, LZ4, LZ4),
ddd Date CODEC(NONE, NONE, NONE, LZ4, ZSTD, LZ4HC, LZ4HC),
somenum Float64 CODEC(LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD)
id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC, Delta(4)),
data String CODEC(ZSTD(2), NONE, Delta(2), LZ4HC, LZ4, LZ4, Delta(8)),
ddd Date CODEC(NONE, NONE, NONE, Delta(1), LZ4, ZSTD, LZ4HC, LZ4HC),
somenum Float64 CODEC(Delta(4), LZ4, LZ4, ZSTD(2), LZ4HC(5), ZSTD(3), ZSTD)
) ENGINE = MergeTree() ORDER BY tuple();
INSERT INTO test.compression_codec_multiple VALUES (1, 'world', toDate('2018-10-05'), 1.1), (2, 'hello', toDate('2018-10-01'), 2.2), (3, 'buy', toDate('2018-10-11'), 3.3);
@ -68,9 +84,9 @@ SELECT sum(cityHash64(*)) FROM test.compression_codec_multiple;
DROP TABLE IF EXISTS test.compression_codec_multiple_more_types;
CREATE TABLE test.compression_codec_multiple_more_types (
id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, LZ4HC),
data FixedString(12) CODEC(ZSTD, ZSTD, NONE, NONE, NONE, LZ4HC),
ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD)
id Decimal128(13) CODEC(ZSTD, LZ4, ZSTD, ZSTD, Delta(2), Delta(4), Delta(1), LZ4HC),
data FixedString(12) CODEC(ZSTD, ZSTD, Delta, Delta, Delta, NONE, NONE, NONE, LZ4HC),
ddd Nested (age UInt8, Name String) CODEC(LZ4, LZ4HC, NONE, NONE, NONE, ZSTD, Delta(8))
) ENGINE = MergeTree() ORDER BY tuple();
SHOW CREATE TABLE test.compression_codec_multiple_more_types;
@ -86,9 +102,9 @@ SET network_compression_method = 'zstd';
SET network_zstd_compression_level = 5;
CREATE TABLE test.compression_codec_multiple_with_key (
somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12)),
id UInt64 CODEC(LZ4, ZSTD, NONE, LZ4HC),
data String CODEC(ZSTD(2), LZ4HC, NONE, LZ4, LZ4)
somedate Date CODEC(ZSTD, ZSTD, ZSTD(12), LZ4HC(12), Delta, Delta),
id UInt64 CODEC(LZ4, ZSTD, Delta, NONE, LZ4HC, Delta),
data String CODEC(ZSTD(2), Delta, LZ4HC, NONE, LZ4, LZ4)
) ENGINE = MergeTree() PARTITION BY somedate ORDER BY id SETTINGS index_granularity = 2;
@ -101,3 +117,20 @@ INSERT INTO test.compression_codec_multiple_with_key SELECT toDate('2018-10-12')
SELECT COUNT(DISTINCT data) FROM test.compression_codec_multiple_with_key WHERE id < 222;
DROP TABLE IF EXISTS test.compression_codec_multiple_with_key;
DROP TABLE IF EXISTS test.test_default_delta;
CREATE TABLE test.test_default_delta(
id UInt64 CODEC(Delta),
data String CODEC(Delta),
somedate Date CODEC(Delta),
somenum Float64 CODEC(Delta),
somestr FixedString(3) CODEC(Delta),
othernum Int64 CODEC(Delta),
yetothernum Float32 CODEC(Delta),
ddd Nested (age UInt8, Name String, OName String, BName String) CODEC(Delta)
) ENGINE = MergeTree() ORDER BY tuple();
SHOW CREATE TABLE test.test_default_delta;
DROP TABLE IF EXISTS test.test_default_delta;
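As the reference output earlier in this diff shows, an unparameterized `Delta` is expanded to `Delta(N)`, where N is the byte width of the column type (`Delta(8)` for UInt64, `Delta(2)` for Date, `Delta(1)` for String). A minimal sketch of that rule:

```sql
-- Unparameterized Delta picks its element width from the column type.
CREATE TABLE delta_default_demo
(
    id UInt64 CODEC(Delta),  -- shown as Delta(8)
    d Date CODEC(Delta)      -- shown as Delta(2)
) ENGINE = MergeTree() ORDER BY tuple();

SHOW CREATE TABLE delta_default_demo;
```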

View File

@ -0,0 +1,6 @@
91
1
46
1
33
1

View File

@ -0,0 +1,114 @@
SET send_logs_level = 'none';
DROP TABLE IF EXISTS test.delta_codec_synthetic;
DROP TABLE IF EXISTS test.default_codec_synthetic;
CREATE TABLE test.delta_codec_synthetic
(
id UInt64 Codec(Delta, ZSTD)
) ENGINE MergeTree() ORDER BY tuple();
CREATE TABLE test.default_codec_synthetic
(
id UInt64 Codec(ZSTD)
) ENGINE MergeTree() ORDER BY tuple();
INSERT INTO test.delta_codec_synthetic SELECT number FROM system.numbers LIMIT 5000000;
INSERT INTO test.default_codec_synthetic SELECT number FROM system.numbers LIMIT 5000000;
OPTIMIZE TABLE test.delta_codec_synthetic FINAL;
OPTIMIZE TABLE test.default_codec_synthetic FINAL;
SELECT
floor(big_size / small_size) AS ratio
FROM
(SELECT 1 AS key, sum(bytes_on_disk) AS small_size FROM system.parts WHERE database == 'test' and table == 'delta_codec_synthetic')
INNER JOIN
(SELECT 1 AS key, sum(bytes_on_disk) as big_size FROM system.parts WHERE database == 'test' and table == 'default_codec_synthetic')
USING(key);
SELECT
small_hash == big_hash
FROM
(SELECT 1 AS key, sum(cityHash64(*)) AS small_hash FROM test.delta_codec_synthetic)
INNER JOIN
(SELECT 1 AS key, sum(cityHash64(*)) AS big_hash FROM test.default_codec_synthetic)
USING(key);
DROP TABLE IF EXISTS test.delta_codec_synthetic;
DROP TABLE IF EXISTS test.default_codec_synthetic;
DROP TABLE IF EXISTS test.delta_codec_float;
DROP TABLE IF EXISTS test.default_codec_float;
CREATE TABLE test.delta_codec_float
(
id Float64 Codec(Delta, LZ4HC)
) ENGINE MergeTree() ORDER BY tuple();
CREATE TABLE test.default_codec_float
(
id Float64 Codec(LZ4HC)
) ENGINE MergeTree() ORDER BY tuple();
INSERT INTO test.delta_codec_float SELECT number FROM numbers(1547510400, 500000) WHERE number % 3 == 0 OR number % 5 == 0 OR number % 7 == 0 OR number % 11 == 0;
INSERT INTO test.default_codec_float SELECT * from test.delta_codec_float;
OPTIMIZE TABLE test.delta_codec_float FINAL;
OPTIMIZE TABLE test.default_codec_float FINAL;
SELECT
floor(big_size / small_size) as ratio
FROM
(SELECT 1 AS key, sum(bytes_on_disk) AS small_size FROM system.parts WHERE database = 'test' and table = 'delta_codec_float')
INNER JOIN
(SELECT 1 AS key, sum(bytes_on_disk) as big_size FROM system.parts WHERE database = 'test' and table = 'default_codec_float') USING(key);
SELECT
small_hash == big_hash
FROM
(SELECT 1 AS key, sum(cityHash64(*)) AS small_hash FROM test.delta_codec_float)
INNER JOIN
(SELECT 1 AS key, sum(cityHash64(*)) AS big_hash FROM test.default_codec_float)
USING(key);
DROP TABLE IF EXISTS test.delta_codec_float;
DROP TABLE IF EXISTS test.default_codec_float;
DROP TABLE IF EXISTS test.delta_codec_string;
DROP TABLE IF EXISTS test.default_codec_string;
CREATE TABLE test.delta_codec_string
(
id Float64 Codec(Delta, LZ4)
) ENGINE MergeTree() ORDER BY tuple();
CREATE TABLE test.default_codec_string
(
id Float64 Codec(LZ4)
) ENGINE MergeTree() ORDER BY tuple();
INSERT INTO test.delta_codec_string SELECT concat(toString(number), toString(number % 100)) FROM numbers(1547510400, 500000);
INSERT INTO test.default_codec_string SELECT * from test.delta_codec_string;
OPTIMIZE TABLE test.delta_codec_string FINAL;
OPTIMIZE TABLE test.default_codec_string FINAL;
SELECT
floor(big_size / small_size) as ratio
FROM
(SELECT 1 AS key, sum(bytes_on_disk) AS small_size FROM system.parts WHERE database = 'test' and table = 'delta_codec_string')
INNER JOIN
(SELECT 1 AS key, sum(bytes_on_disk) as big_size FROM system.parts WHERE database = 'test' and table = 'default_codec_string') USING(key);
SELECT
small_hash == big_hash
FROM
(SELECT 1 AS key, sum(cityHash64(*)) AS small_hash FROM test.delta_codec_string)
INNER JOIN
(SELECT 1 AS key, sum(cityHash64(*)) AS big_hash FROM test.default_codec_string)
USING(key);
DROP TABLE IF EXISTS test.delta_codec_string;
DROP TABLE IF EXISTS test.default_codec_string;

View File

@ -0,0 +1,35 @@
*** Test fetches ***
*** replica 1 ***
1 1
2 2
*** replica 2 ***
1 1
2 2
*** Test merges ***
*** replica 1 ***
all_0_1_1 1
all_0_1_1 2
*** replica 2 ***
all_0_1_1 1
all_0_1_1 2
*** Test part removal ***
*** replica 1 ***
all_0_1_1
all_0_1_1
*** replica 2 ***
all_0_1_1
all_0_1_1
*** Test ALTER ***
*** replica 1 ***
1 1
2 1
*** replica 2 ***
1 1
2 1
*** Test CLEAR COLUMN ***
*** replica 1 ***
1 0
2 0
*** replica 2 ***
1 0
2 0

View File

@ -0,0 +1,61 @@
DROP TABLE IF EXISTS test.part_header_r1;
DROP TABLE IF EXISTS test.part_header_r2;
CREATE TABLE test.part_header_r1(x UInt32, y UInt32)
ENGINE ReplicatedMergeTree('/clickhouse/tables/test/part_header', '1') ORDER BY x
SETTINGS use_minimalistic_part_header_in_zookeeper = 0,
old_parts_lifetime = 1,
cleanup_delay_period = 0,
cleanup_delay_period_random_add = 0;
CREATE TABLE test.part_header_r2(x UInt32, y UInt32)
ENGINE ReplicatedMergeTree('/clickhouse/tables/test/part_header', '2') ORDER BY x
SETTINGS use_minimalistic_part_header_in_zookeeper = 1,
old_parts_lifetime = 1,
cleanup_delay_period = 0,
cleanup_delay_period_random_add = 0;
SELECT '*** Test fetches ***';
INSERT INTO test.part_header_r1 VALUES (1, 1);
INSERT INTO test.part_header_r2 VALUES (2, 2);
SYSTEM SYNC REPLICA test.part_header_r1;
SYSTEM SYNC REPLICA test.part_header_r2;
SELECT '*** replica 1 ***';
SELECT x, y FROM test.part_header_r1 ORDER BY x;
SELECT '*** replica 2 ***';
SELECT x, y FROM test.part_header_r2 ORDER BY x;
SELECT '*** Test merges ***';
OPTIMIZE TABLE test.part_header_r1;
SYSTEM SYNC REPLICA test.part_header_r2;
SELECT '*** replica 1 ***';
SELECT _part, x FROM test.part_header_r1 ORDER BY x;
SELECT '*** replica 2 ***';
SELECT _part, x FROM test.part_header_r2 ORDER BY x;
SELECT sleep(2) FORMAT Null;
SELECT '*** Test part removal ***';
SELECT '*** replica 1 ***';
SELECT name FROM system.parts WHERE database = 'test' AND table = 'part_header_r1';
SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/test/part_header/replicas/1/parts';
SELECT '*** replica 2 ***';
SELECT name FROM system.parts WHERE database = 'test' AND table = 'part_header_r2';
SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/test/part_header/replicas/1/parts';
SELECT '*** Test ALTER ***';
ALTER TABLE test.part_header_r1 MODIFY COLUMN y String;
SELECT '*** replica 1 ***';
SELECT x, length(y) FROM test.part_header_r1 ORDER BY x;
SELECT '*** replica 2 ***';
SELECT x, length(y) FROM test.part_header_r2 ORDER BY x;
SELECT '*** Test CLEAR COLUMN ***';
SET replication_alter_partitions_sync = 2;
ALTER TABLE test.part_header_r1 CLEAR COLUMN y IN PARTITION tuple();
SELECT '*** replica 1 ***';
SELECT x, length(y) FROM test.part_header_r1 ORDER BY x;
SELECT '*** replica 2 ***';
SELECT x, length(y) FROM test.part_header_r2 ORDER BY x;
DROP TABLE test.part_header_r1;
DROP TABLE test.part_header_r2;
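One way to see the difference between the two schemes is to compare the part znodes of the replicas; under the minimalistic scheme the header is stored in the part znode's value rather than in `/columns` and `/checksums` children. A sketch under that assumption:

```sql
-- Replica 2 was created with use_minimalistic_part_header_in_zookeeper = 1,
-- so its part znodes should carry an inline header and no children.
SELECT name, value != '' AS has_inline_header, numChildren
FROM system.zookeeper
WHERE path = '/clickhouse/tables/test/part_header/replicas/2/parts';
```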

View File

@ -5,13 +5,52 @@ set -e
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
#create the scjema file
echo "@0x803231eaa402b968;struct NestedNestedOne {nestednestednumber @0 :UInt64;}struct NestedNestedTwo {nestednestedtext @0 :Text;}struct NestedOne {nestednestedone @0 :NestedNestedOne;nestednestedtwo @1 :NestedNestedTwo;nestednumber @2:UInt64;}struct NestedTwo {nestednestedone @0 :NestedNestedOne;nestednestedtwo @1 : NestedNestedTwo;nestedtext @2 :Text;}struct CapnProto { number @0 :UInt64;string @1 :Text;nestedone @2 : NestedOne;nestedtwo @3 : NestedTwo;nestedthree @4 : NestedNestedTwo;}" > test.capnp
#create the schema file
echo "
@0x803231eaa402b968;
struct NestedNestedOne
{
nestednestednumber @0 : UInt64;
}
struct NestedNestedTwo
{
nestednestedtext @0 : Text;
}
struct NestedOne
{
nestednestedone @0 : NestedNestedOne;
nestednestedtwo @1 : NestedNestedTwo;
nestednumber @2: UInt64;
}
struct NestedTwo
{
nestednestedone @0 : NestedNestedOne;
nestednestedtwo @1 : NestedNestedTwo;
nestedtext @2 : Text;
}
struct CapnProto
{
number @0 : UInt64;
string @1 : Text;
nestedone @2 : NestedOne;
nestedtwo @3 : NestedTwo;
nestedthree @4 : NestedNestedTwo;
}" > test.capnp
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test.capnproto_input"
$CLICKHOUSE_CLIENT -q "CREATE TABLE test.capnproto_input ( number UInt64,string String,nestedone_nestednumber UInt64,nestedone_nestednestedone_nestednestednumber UInt64,nestedone_nestednestedtwo_nestednestedtext String,nestedtwo_nestednestedtwo_nestednestedtext String,nestedtwo_nestednestedone_nestednestednumber UInt64,nestedtwo_nestedtext String) ENGINE = Memory"
$CLICKHOUSE_CLIENT -q "CREATE TABLE test.capnproto_input
(
number UInt64,
string String,
nestedone_nestednumber UInt64,
nestedone_nestednestedone_nestednestednumber UInt64,
nestedone_nestednestedtwo_nestednestedtext String,
nestedtwo_nestednestedtwo_nestednestedtext String,
nestedtwo_nestednestedone_nestednestednumber UInt64,
nestedtwo_nestedtext String
) ENGINE = Memory"
echo -ne '\x00\x00\x00\x00\x15\x00\x00\x00\x00\x00\x00\x00\x01\x00\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x0d\x00\x00\x00\x12\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x02\x00\x20\x00\x00\x00\x00\x00\x03\x00\x34\x00\x00\x00\x00\x00\x01\x00\x32\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x00\x00\x01\x00\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x34\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x37\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x39\x00\x00\x00\x00\x00\x00\x00' | $CLICKHOUSE_CLIENT --stacktrace --format_schema='test:CapnProto' --query="INSERT INTO test.capnproto_input FORMAT CapnProto";
$CLICKHOUSE_CLIENT -q "SELECT * FROM test.capnproto_input"
$CLICKHOUSE_CLIENT -q "DROP TABLE test.capnproto_input"

View File

@ -0,0 +1,25 @@
-- this test cannot pass without the new DFA matching algorithm of sequenceMatch
DROP TABLE IF EXISTS test.sequence;
CREATE TABLE test.sequence
(
userID UInt64,
eventType Enum8('A' = 1, 'B' = 2, 'C' = 3),
EventTime UInt64
)
ENGINE = Memory;
INSERT INTO test.sequence SELECT 1, number = 0 ? 'A' : (number < 1000000 ? 'B' : 'C'), number FROM numbers(1000001);
SELECT userID
FROM test.sequence
GROUP BY userID
HAVING sequenceMatch('(?1).*(?2).*(?3)')(toDateTime(EventTime), eventType = 'A', eventType = 'B', eventType = 'C');
SELECT userID
FROM test.sequence
GROUP BY userID
HAVING sequenceMatch('(?1).*(?2).*(?3)')(toDateTime(EventTime), eventType = 'A', eventType = 'B', eventType = 'A');
DROP TABLE test.sequence;
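Both queries above use time-free patterns, which is what the new DFA path handles; adding a temporal condition such as `(?t<N)` routes the match through the older backtracking algorithm instead. A hedged sketch against the same table (the threshold is illustrative):

```sql
-- A time condition in the pattern disables the DFA-based matching.
SELECT userID
FROM test.sequence
GROUP BY userID
HAVING sequenceMatch('(?1)(?t<1000000)(?2)')(toDateTime(EventTime), eventType = 'A', eventType = 'C');
```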

View File

@ -0,0 +1,16 @@
-- triggers assertion in debug build
DROP TABLE IF EXISTS test.rollup_having;
CREATE TABLE test.rollup_having (
a Nullable(String),
b Nullable(String)
) ENGINE = Memory;
INSERT INTO test.rollup_having VALUES (NULL, NULL);
INSERT INTO test.rollup_having VALUES ('a', NULL);
INSERT INTO test.rollup_having VALUES ('a', 'b');
SELECT a, b, count(*) FROM test.rollup_having GROUP BY a, b WITH ROLLUP WITH TOTALS HAVING a IS NOT NULL;
SELECT a, b, count(*) FROM test.rollup_having GROUP BY a, b WITH ROLLUP WITH TOTALS HAVING a IS NOT NULL and b IS NOT NULL;
DROP TABLE test.rollup_having;

debian/changelog
View File

@ -1,5 +1,5 @@
clickhouse (19.1.0) unstable; urgency=low
clickhouse (19.1.1) unstable; urgency=low
* Modified source code
-- <root@yandex-team.ru> Tue, 01 Jan 2019 07:16:20 +0300
-- <root@yandex-team.ru> Wed, 16 Jan 2019 14:04:37 +0300

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/"
ARG version=19.1.0
ARG version=19.1.1
RUN apt-get update \
&& apt-get install --yes --no-install-recommends \

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/"
ARG version=19.1.0
ARG version=19.1.1
ARG gosu_ver=1.10
RUN apt-get update \

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/"
ARG version=19.1.0
ARG version=19.1.1
RUN apt-get update && \
apt-get install -y apt-transport-https dirmngr && \

View File

@ -18,7 +18,8 @@ RUN apt-get update -y \
sudo \
openssl \
netcat-openbsd \
telnet
telnet \
moreutils
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
@ -29,8 +30,14 @@ COPY part_log.xml /etc/clickhouse-server/config.d/part_log.xml
COPY log_queries.xml /etc/clickhouse-server/users.d/log_queries.xml
CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
dpkg -i package_folder/clickhouse-common-static-dbg_*.deb; \
dpkg -i package_folder/clickhouse-server_*.deb; \
dpkg -i package_folder/clickhouse-client_*.deb; \
dpkg -i package_folder/clickhouse-test_*.deb; \
echo "TSAN_OPTIONS='halt_on_error=1'" >> /etc/environment; \
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \
echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
service zookeeper start; sleep 5; \
service clickhouse-server start && sleep 5 && clickhouse-test --shard --zookeeper $SKIP_TESTS_OPTION 2>&1 | tee test_output/test_result.txt
service clickhouse-server start && sleep 5 && clickhouse-test --shard --zookeeper $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt

View File

@ -1,4 +1,4 @@
FROM ubuntu:18.10
FROM yandex/clickhouse-deb-builder
RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
@ -24,7 +24,14 @@ COPY log_queries.xml /etc/clickhouse-server/users.d/log_queries.xml
COPY part_log.xml /etc/clickhouse-server/config.d/part_log.xml
CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
dpkg -i package_folder/clickhouse-common-static-dbg_*.deb; \
dpkg -i package_folder/clickhouse-server_*.deb; \
dpkg -i package_folder/clickhouse-client_*.deb; \
dpkg -i package_folder/clickhouse-test_*.deb; \
echo "TSAN_OPTIONS='halt_on_error=1'" >> /etc/environment; \
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \
echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
echo "TSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
service clickhouse-server start && sleep 1 && ./stress --output-folder test_output

View File

@ -0,0 +1,60 @@
# UUID {#uuid-data-type}
A universally unique identifier (UUID) is a 16-byte number used to identify records. For detailed information about UUIDs, see [Wikipedia](https://en.wikipedia.org/wiki/Universally_unique_identifier).
An example of a UUID value is shown below:
```
61f0c404-5cb3-11e7-907b-a6006ad3dba0
```
If you do not specify a value for the UUID column when inserting a new record, the UUID is filled with zeros:
```
00000000-0000-0000-0000-000000000000
```
## How to generate
To generate the UUID value, ClickHouse provides the [generateUUIDv4](../query_language/functions/uuid_functions.md) function.
## Usage example
**Example 1**
This example demonstrates creating a table with the UUID type column and inserting a value into the table.
``` sql
:) CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog
:) INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1'
:) SELECT * FROM t_uuid
┌────────────────────────────────────x─┬─y─────────┐
│ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │
└──────────────────────────────────────┴───────────┘
```
**Example 2**
In this example, the UUID column value is not specified when inserting a new record.
``` sql
:) INSERT INTO t_uuid (y) VALUES ('Example 2')
:) SELECT * FROM t_uuid
┌────────────────────────────────────x─┬─y─────────┐
│ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │
│ 00000000-0000-0000-0000-000000000000 │ Example 2 │
└──────────────────────────────────────┴───────────┘
```
## Restrictions
The UUID data type only supports the functions that the [String](string.md) data type also supports (for example, [min](../query_language/agg_functions/reference.md#agg_function-min), [max](../query_language/agg_functions/reference.md#agg_function-max), and [count](../query_language/agg_functions/reference.md#agg_function-count)).
The UUID data type is not supported by arithmetic operations (for example, [abs](../query_language/functions/arithmetic_functions.md#arithm_func-abs)) or by aggregate functions such as [sum](../query_language/agg_functions/reference.md#agg_function-sum) and [avg](../query_language/agg_functions/reference.md#agg_function-avg).
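A short illustration of these restrictions, reusing the `t_uuid` table from the examples above:

```sql
-- String-compatible aggregates work on UUID columns:
SELECT min(x), max(x), count(x) FROM t_uuid;

-- Arithmetic aggregates do not; this would throw an exception:
-- SELECT sum(x) FROM t_uuid;
```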
[Original article](https://clickhouse.yandex/docs/en/data_types/uuid/) <!--hide-->

View File

@ -1,7 +1,7 @@
# Function reference
## count()
## count() {#agg_function-count}
Counts the number of rows. Accepts zero arguments and returns UInt64.
The syntax `COUNT(DISTINCT x)` is not supported. The separate `uniq` aggregate function exists for this purpose.
@ -179,15 +179,15 @@ binary decimal
01101000 = 104
```
## min(x)
## min(x) {#agg_function-min}
Calculates the minimum.
## max(x)
## max(x) {#agg_function-max}
Calculates the maximum.
## argMin(arg, val)
## argMin(arg, val) {#agg_function-argMin}
Calculates the 'arg' value for a minimal 'val' value. If there are several different values of 'arg' for minimal values of 'val', the first of these values encountered is output.
@ -206,7 +206,7 @@ SELECT argMin(user, salary) FROM salary
└──────────────────────┘
```
## argMax(arg, val)
## argMax(arg, val) {#agg_function-argMax}
Calculates the 'arg' value for a maximum 'val' value. If there are several different values of 'arg' for maximum values of 'val', the first of these values encountered is output.
@ -259,7 +259,7 @@ GROUP BY timeslot
└─────────────────────┴──────────────────────────────────────────────┘
```
## avg(x)
## avg(x) {#agg_function-avg}
Calculates the average.
Only works for numbers.

View File

@ -59,7 +59,7 @@ An exception is thrown when dividing by zero or when dividing a minimal negative
Calculates a number with the reverse sign. The result is always signed.
## abs(a)
## abs(a) {#arithm_func-abs}
Calculates the absolute value of the number (a). That is, if a &lt; 0, it returns -a. For unsigned types it doesn't do anything. For signed integer types, it returns an unsigned number.

View File

@ -1,6 +1,6 @@
# Functions for working with external dictionaries {#ext_dict_functions}
For information on connecting and configuring external dictionaries, see "[External dictionaries](../dicts/external_dicts.md)".
For information on connecting and configuring external dictionaries, see [External dictionaries](../dicts/external_dicts.md).
## dictGetUInt8, dictGetUInt16, dictGetUInt32, dictGetUInt64
@ -19,7 +19,7 @@ For information on connecting and configuring external dictionaries, see "[Exter
- Get the value of the attr_name attribute from the dict_name dictionary using the 'id' key. `dict_name` and `attr_name` are constant strings. `id` must be UInt64.
If there is no `id` key in the dictionary, it returns the default value specified in the dictionary description.
## dictGetTOrDefault
## dictGetTOrDefault {#ext_dict_functions_dictGetTOrDefault}
`dictGetT('dict_name', 'attr_name', id, default)`

View File

@ -0,0 +1,108 @@
# Functions for working with UUID
The functions for working with UUID are listed below.
## generateUUIDv4 {#uuid_function-generate}
Generates a [UUID](../../data_types/uuid.md) of [version 4](https://tools.ietf.org/html/rfc4122#section-4.4).
```sql
generateUUIDv4()
```
**Returned value**
The UUID type value.
**Usage example**
This example demonstrates creating a table with the UUID type column and inserting a value into the table.
``` sql
:) CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog
:) INSERT INTO t_uuid SELECT generateUUIDv4()
:) SELECT * FROM t_uuid
┌────────────────────────────────────x─┐
│ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │
└──────────────────────────────────────┘
```
## toUUID (x)
Converts a String type value to the UUID type.
```sql
toUUID(String)
```
**Returned value**
The UUID type value.
**Usage example**
``` sql
:) SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid
┌─────────────────────────────────uuid─┐
│ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │
└──────────────────────────────────────┘
```
## UUIDStringToNum
Accepts a string containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns it as a set of bytes in a [FixedString(16)](../../data_types/fixedstring.md).
``` sql
UUIDStringToNum(String)
```
**Returned value**
FixedString(16)
**Usage examples**
``` sql
:) SELECT
'612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid,
UUIDStringToNum(uuid) AS bytes
┌─uuid─────────────────────────────────┬─bytes────────────┐
│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │
└──────────────────────────────────────┴──────────────────┘
```
## UUIDNumToString
Accepts a [FixedString(16)](../../data_types/fixedstring.md) value, and returns a string containing 36 characters in text format.
``` sql
UUIDNumToString(FixedString(16))
```
**Returned value**
String.
**Usage example**
``` sql
SELECT
'a/<@];!~p{jTj={)' AS bytes,
UUIDNumToString(toFixedString(bytes, 16)) AS uuid
┌─bytes────────────┬─uuid─────────────────────────────────┐
│ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │
└──────────────────┴──────────────────────────────────────┘
```
## See also
- [dictGetUUID](ext_dict_functions.md)
- [dictGetUUIDOrDefault](ext_dict_functions.md#ext_dict_functions_dictGetTOrDefault)
[Original article](https://clickhouse.yandex/docs/en/query_language/functions/uuid_function/) <!--hide-->

View File

@ -26,52 +26,61 @@ SQL-style and C-style comments are supported.
SQL-style comments: from `--` to the end of the line. The space after `--` can be omitted.
Comments in C-style: from `/*` to `*/`. These comments can be multiline. Spaces are not required here, either.
## Keywords
## Keywords {#syntax-keywords}
Keywords (such as `SELECT`) are not case-sensitive. Everything else (column names, functions, and so on), in contrast to standard SQL, is case-sensitive. Keywords are not reserved (they are just parsed as keywords in the corresponding context).
Keywords (such as `SELECT`) are not case-sensitive. Everything else (column names, functions, and so on), in contrast to standard SQL, is case-sensitive.
## Identifiers
Keywords are not reserved (they are just parsed as keywords in the corresponding context). If you use [identifiers](#syntax-identifiers) that are the same as keywords, enclose them in quotes. For example, the query `SELECT "FROM" FROM table_name` is valid if the table `table_name` has a column with the name `"FROM"`.
Identifiers (column names, functions, and data types) can be quoted or non-quoted.
Non-quoted identifiers start with a Latin letter or underscore, and continue with a Latin letter, underscore, or number. In other words, they must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$`. Examples: `x, _1, X_y__Z123_.`
## Identifiers {#syntax-identifiers}
Quoted identifiers are placed in reversed quotation marks `` `id` `` (the same as in MySQL), and can indicate any set of bytes (non-empty). In addition, symbols (for example, the reverse quotation mark) inside this type of identifier can be backslash-escaped. Escaping rules are the same as for string literals (see below).
We recommend using identifiers that do not need to be quoted.
Identifiers are:
- Cluster, database, table, partition and column names;
- Functions;
- Data types;
- [Expression aliases](#syntax-expression_aliases).
Identifiers can be quoted or non-quoted. It is recommended to use non-quoted identifiers.
Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and can not be equal to [keywords](#syntax-keywords). Examples: `x, _1, X_y__Z123_.`
If you want to use identifiers that are the same as keywords, or to use other symbols in identifiers, quote them using double quotes or backticks, for example, `"id"`, `` `id` ``.
## Literals
There are numeric literals, string literals, and compound literals.
There are numeric, string, compound, and `NULL` literals.
### Numeric Literals
### Numeric
A numeric literal tries to be parsed:
- First as a 64-bit signed number, using the 'strtoull' function.
- If unsuccessful, as a 64-bit unsigned number, using the 'strtoll' function.
- If unsuccessful, as a floating-point number using the 'strtod' function.
- First as a 64-bit signed number, using the [strtoll](https://en.cppreference.com/w/cpp/string/byte/strtol) function.
- If unsuccessful, as a 64-bit unsigned number, using the [strtoull](https://en.cppreference.com/w/cpp/string/byte/strtoul) function.
- If unsuccessful, as a floating-point number, using the [strtod](https://en.cppreference.com/w/cpp/string/byte/strtof) function.
- Otherwise, an error is returned.
The corresponding value will have the smallest type that the value fits in.
For example, 1 is parsed as UInt8, but 256 is parsed as UInt16. For more information, see "Data types".
For example, 1 is parsed as `UInt8`, but 256 is parsed as `UInt16`. For more information, see [Data types](../data_types/index.md).
Examples: `1`, `18446744073709551615`, `0xDEADBEEF`, `01`, `0.1`, `1e100`, `-1e-100`, `inf`, `nan`.
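A quick way to observe the "smallest type that fits" rule is `toTypeName`:

```sql
SELECT toTypeName(1), toTypeName(256), toTypeName(1.0);
-- UInt8, UInt16, Float64
```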
### String Literals
### String
Only string literals in single quotes are supported. The enclosed characters can be backslash-escaped. The following escape sequences have a corresponding special value: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\a`, `\v`, `\xHH`. In all other cases, escape sequences in the format `\c`, where "c" is any character, are converted to "c". This means that you can use the sequences `\'`and`\\`. The value will have the String type.
Only string literals in single quotes are supported. The enclosed characters can be backslash-escaped. The following escape sequences have a corresponding special value: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\a`, `\v`, `\xHH`. In all other cases, escape sequences in the format `\c`, where `c` is any character, are converted to `c`. This means that you can use the sequences `\'`and`\\`. The value will have the String type.
The minimum set of characters that you need to escape in string literals: `'` and `\`.
The minimum set of characters that you need to escape in string literals: `'` and `\`. Single quote can be escaped with the single quote, literals `'It\'s'` and `'It''s'` are equal.
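A quick check that the two escaping styles are equivalent:

```sql
SELECT 'It\'s' = 'It''s';
-- 1
```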
### Compound Literals
### Compound
Constructions are supported for arrays: `[1, 2, 3]` and tuples: `(1, 'Hello, world!', 2)`.
Actually, these are not literals, but expressions with the array creation operator and the tuple creation operator, respectively.
For more information, see the section "Operators".
An array must consist of at least one item, and a tuple must have at least two items.
Tuples have a special purpose for use in the IN clause of a SELECT query. Tuples can be obtained as the result of a query, but they can't be saved to a database (with the exception of Memory-type tables).
Tuples have a special purpose for use in the `IN` clause of a `SELECT` query. Tuples can be obtained as the result of a query, but they can't be saved to a database (with the exception of [Memory](../operations/table_engines/memory.md) tables).
### NULL Literal {#null-literal}
### NULL {#null-literal}
Indicates that the value is missing.
@ -92,13 +101,13 @@ There are regular and aggregate functions (see the section "Aggregate functions"
Operators are converted to their corresponding functions during query parsing, taking their priority and associativity into account.
For example, the expression `1 + 2 * 3 + 4` is transformed to `plus(plus(1, multiply(2, 3)), 4)`.
For more information, see the section "Operators" below.
## Data Types and Database Table Engines
Data types and table engines in the `CREATE` query are written the same way as identifiers or functions. In other words, they may or may not contain an arguments list in brackets. For more information, see the sections "Data types," "Table engines," and "CREATE".
## Expression Aliases
## Expression Aliases {#syntax-expression_aliases}
An alias is a user-defined name for an expression in a query.
@ -106,27 +115,27 @@ Alias is a user defined name for an expression in a query.
expr AS alias
```
- `AS`keyword for defining aliases. You can define alias for a table name or a column name in SELECT clause skipping `AS` keyword.
- `AS`: Keyword for defining aliases. You can define an alias for a table name or a column name in the `SELECT` clause without the `AS` keyword.
For example, `SELECT b.column_name from t b`.
For example, `SELECT table_name_alias.column_name FROM table_name table_name_alias`.
In the [CAST function](functions/type_conversion_functions.md), the `AS` keyword has another meaning. See the description of the function.
- `expr`any expression supported by ClickHouse.
- `expr`: Any expression supported by ClickHouse.
For example `SELECT column_name * 2 AS double FROM some_table`.
- `alias`[string literal](#string-literals). If an alias contains spaces, enclose it in double quotes or backticks.
- `alias`: Name for `expr`. Aliases should comply with the [identifiers](#syntax-identifiers) syntax.
For example, `SELECT "table t".col_name FROM t AS "table t"`.
For example, `SELECT "table t".column_name FROM table_name AS "table t"`.
### Peculiarities of Use
Aliases are global for a query or subquery, and you can define an alias in any part of a query for any expression. For example, `SELECT (1 AS n) + 2, n`.
Aliases are not visible in between subqueries. For example, while executing the query `SELECT (SELECT sum(b.a) + num FROM b) - a.a AS num FROM a` ClickHouse generates exception `Unknown identifier: num`.
Aliases are not visible in subqueries. For example, while executing the query `SELECT (SELECT sum(b.a) + num FROM b) - a.a AS num FROM a` ClickHouse generates the exception `Unknown identifier: num`.
If alias is defined for result columns in SELECT clause in a subquery, these columns are visible in outer query. For example, `SELECT n + m FROM (SELECT 1 AS n, 2 AS m)`.
If an alias is defined for result columns in the `SELECT` clause of a subquery, these columns are visible in the outer query. For example, `SELECT n + m FROM (SELECT 1 AS n, 2 AS m)`.
Be careful with aliases that are the same as column or table names. Consider the following example:

View File

@ -1,5 +1,5 @@
# remote
# remote, remoteSecure
Allows you to access remote servers without creating a `Distributed` table.
@ -72,5 +72,6 @@ The `remote` table function can be useful in the following cases:
If the user is not specified, `default` is used.
If the password is not specified, an empty password is used.
`remoteSecure` is the same as `remote`, but over a secured connection. Default port: `tcp_port_secure` from the config, or 9440.
[Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/remote/) <!--hide-->

View File

@ -50,7 +50,7 @@ sudo apt-get install clickhouse-client clickhouse-server
### Из исходникого кода
Для компиляции ClickHouse вручную, испольщуйте инструкцию для [Linux](../development/build.md) или [Mac OS X](../development/build_osx.md).
Для компиляции ClickHouse вручную, используйте инструкцию для [Linux](../development/build.md) или [Mac OS X](../development/build_osx.md).
Можно скомпилировать пакеты и установить их, либо использовать программы без установки пакетов. Также при ручной сборке можно отключить необходимость поддержки набора инструкций SSE 4.2 или собрать под процессоры архитектуры AArch64.
@ -97,7 +97,7 @@ $ clickhouse-client
По умолчанию он соединяется с localhost:9000, от имени пользователя `default` без пароля. Также клиент может быть использован для соединения с удалённым сервером с помощью аргумента `--host`.
Терминал должен использлвать кодировку UTF-8.
Терминал должен использовать кодировку UTF-8.
Более подробная информация о клиенте располагается в разделе [«Клиент командной строки»](../interfaces/cli.md).

View File

@ -157,7 +157,7 @@ x=1 y=\N
clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv
```
&ast;По умолчанию — `,`. См. настройку [format_csv_delimiter](/operations/settings/settings/#settings-format_csv_delimiter) для дополнительной информации.
&ast;По умолчанию — `,`. См. настройку [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) для дополнительной информации.
При парсинге, все значения могут парситься как в кавычках, так и без кавычек. Поддерживаются как двойные, так и одинарные кавычки. В том числе, строки могут быть расположены без кавычек - тогда они парсятся до символа-разделителя или перевода строки (CR или LF). В нарушение RFC, в случае парсинга строк не в кавычках, начальные и конечные пробелы и табы игнорируются. В качестве перевода строки, поддерживаются как Unix (LF), так и Windows (CR LF) и Mac OS Classic (LF CR) варианты.

View File

@ -245,7 +245,7 @@ TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster]
Удаляет все данные из таблицы. Если условие `IF EXISTS` не указано, запрос вернет ошибку, если таблицы не существует.
Запрос `TRUNCATE` не поддерживается для следующих движков: [View](../operations/table_engines/view.md), [File](../operations/table_engines/file.md), [URL](../operations/table_engines/url.md) and [Null](../operations/table_engines/null.md).
Запрос `TRUNCATE` не поддерживается для следующих движков: [View](../operations/table_engines/view.md), [File](../operations/table_engines/file.md), [URL](../operations/table_engines/url.md) и [Null](../operations/table_engines/null.md).
## USE

View File

@ -1,5 +1,5 @@
# remote
# remote, remoteSecure
Позволяет обратиться к удалённым серверам без создания таблицы типа `Distributed`.
@ -72,4 +72,6 @@ example01-{01..02}-{1|2}
Если пользователь не задан,то используется `default`.
Если пароль не задан, то используется пустой пароль.
`remoteSecure` - аналогично функции `remote`, но с соединением по шифрованному каналу. Порт по умолчанию - `tcp_port_secure` из конфига или 9440.
[Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/table_functions/remote/) <!--hide-->

View File

@ -39,6 +39,7 @@ nav:
- 'Boolean': 'data_types/boolean.md'
- 'String': 'data_types/string.md'
- 'FixedString(N)': 'data_types/fixedstring.md'
- 'UUID': 'data_types/uuid.md'
- 'Date': 'data_types/date.md'
- 'DateTime': 'data_types/datetime.md'
- 'Enum': 'data_types/enum.md'
@ -81,6 +82,7 @@ nav:
- 'Hash': 'query_language/functions/hash_functions.md'
- 'Generating Pseudo-Random Numbers': 'query_language/functions/random_functions.md'
- 'Encoding': 'query_language/functions/encoding_functions.md'
- 'Working with UUID': 'query_language/functions/uuid_functions.md'
- 'Working with URLs': 'query_language/functions/url_functions.md'
- 'Working with IP Addresses': 'query_language/functions/ip_address_functions.md'
- 'Working with JSON.': 'query_language/functions/json_functions.md'

Some files were not shown because too many files have changed in this diff.