Merge remote-tracking branch 'upstream/master' into fix25

proller 2019-07-16 12:40:12 +03:00
commit ad98560d78
34 changed files with 1151 additions and 295 deletions

View File

@ -1,5 +1,5 @@
if (OS_LINUX AND NOT SANITIZE AND NOT ARCH_ARM AND NOT ARCH_32 AND NOT ARCH_PPC64LE)
option (ENABLE_MIMALLOC "Set to FALSE to disable usage of mimalloc for internal ClickHouse caches" ${NOT_UNBUNDLED})
option (ENABLE_MIMALLOC "Set to FALSE to disable usage of mimalloc for internal ClickHouse caches" FALSE)
endif ()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mimalloc/include/mimalloc.h")
@ -8,6 +8,8 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mimalloc/include/mimalloc.h")
endif ()
if (ENABLE_MIMALLOC)
message (FATAL_ERROR "Mimalloc is not production ready. (Disable with cmake -D ENABLE_MIMALLOC=0). If you want to use mimalloc, you must manually remove this message.")
set (MIMALLOC_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/mimalloc/include)
set (USE_MIMALLOC 1)
set (MIMALLOC_LIBRARY mimalloc-static)

View File

@ -5,15 +5,33 @@
#include "MiAllocator.h"
#include <mimalloc.h>
#include <Common/Exception.h>
#include <Common/formatReadable.h>
#include <IO/WriteHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
}
void * MiAllocator::alloc(size_t size, size_t alignment)
{
void * ptr;
if (alignment == 0)
return mi_malloc(size);
{
ptr = mi_malloc(size);
if (!ptr)
DB::throwFromErrno("MiAllocator: Cannot allocate in mimalloc " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
}
else
return mi_malloc_aligned(size, alignment);
{
ptr = mi_malloc_aligned(size, alignment);
if (!ptr)
DB::throwFromErrno("MiAllocator: Cannot allocate in mimalloc (mi_malloc_aligned) " + formatReadableSizeWithBinarySuffix(size) + " with alignment " + toString(alignment) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
}
return ptr;
}
void MiAllocator::free(void * buf, size_t)
@ -32,10 +50,21 @@ void * MiAllocator::realloc(void * old_ptr, size_t, size_t new_size, size_t alig
return nullptr;
}
if (alignment == 0)
return mi_realloc(old_ptr, alignment);
void * ptr;
return mi_realloc_aligned(old_ptr, new_size, alignment);
if (alignment == 0)
{
ptr = mi_realloc(old_ptr, new_size);
if (!ptr)
DB::throwFromErrno("MiAllocator: Cannot reallocate in mimalloc " + formatReadableSizeWithBinarySuffix(new_size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
}
else
{
ptr = mi_realloc_aligned(old_ptr, new_size, alignment);
if (!ptr)
DB::throwFromErrno("MiAllocator: Cannot reallocate in mimalloc (mi_realloc_aligned) " + formatReadableSizeWithBinarySuffix(size) + " with alignment " + toString(alignment) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
}
return ptr;
}
}
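For illustration, a minimal standalone sketch of the pattern these hunks introduce — check the mimalloc result and fail loudly instead of returning nullptr — using only the public mimalloc API, with a plain std::bad_alloc standing in for ClickHouse's throwFromErrno:

```cpp
#include <mimalloc.h>
#include <cstddef>
#include <new>

// Allocate through mimalloc and throw on failure, mirroring the
// checks this commit adds to MiAllocator::alloc.
void * checkedAlloc(size_t size, size_t alignment)
{
    void * ptr = (alignment == 0) ? mi_malloc(size) : mi_malloc_aligned(size, alignment);
    if (!ptr)
        throw std::bad_alloc{};
    return ptr;
}

int main()
{
    void * p = checkedAlloc(1024, 64);
    mi_free(p);
}
```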

View File

@ -75,3 +75,6 @@ target_link_libraries (cow_compositions PRIVATE clickhouse_common_io)
add_executable (stopwatch stopwatch.cpp)
target_link_libraries (stopwatch PRIVATE clickhouse_common_io)
add_executable (mi_malloc_test mi_malloc_test.cpp)
target_link_libraries (mi_malloc_test PRIVATE clickhouse_common_io)

View File

@ -0,0 +1,118 @@
/** In addition to ClickHouse (Apache 2) license, this file can be also used under MIT license:
MIT License
Copyright (c) 2019 Yandex LLC, Alexey Milovidov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <map>
#include <vector>
#include <cstdint>
#include <random>
#include <stdexcept>
#include <iostream>
#include <Common/config.h>
//#undef USE_MIMALLOC
//#define USE_MIMALLOC 0
#if USE_MIMALLOC
#include <mimalloc.h>
#define malloc mi_malloc
#define free mi_free
#else
#include <stdlib.h>
#endif
size_t total_size{0};
struct Allocation
{
void * ptr = nullptr;
size_t size = 0;
Allocation() {}
Allocation(size_t size)
: size(size)
{
ptr = malloc(size);
if (!ptr)
throw std::runtime_error("Cannot allocate memory");
total_size += size;
}
~Allocation()
{
if (ptr)
{
free(ptr);
total_size -= size;
}
ptr = nullptr;
}
Allocation(const Allocation &) = delete;
Allocation(Allocation && rhs)
{
ptr = rhs.ptr;
size = rhs.size;
rhs.ptr = nullptr;
rhs.size = 0;
}
};
int main(int, char **)
{
std::vector<Allocation> allocations;
constexpr size_t limit = 100000000;
constexpr size_t min_alloc_size = 65536;
constexpr size_t max_alloc_size = 10000000;
std::mt19937 rng;
auto distribution = std::uniform_int_distribution(min_alloc_size, max_alloc_size);
size_t total_allocations = 0;
while (true)
{
size_t size = distribution(rng);
while (total_size + size > limit)
allocations.pop_back();
allocations.emplace_back(size);
++total_allocations;
if (total_allocations % (1ULL << 20) == 0)
std::cerr << "Total allocations: " << total_allocations << "\n";
}
}

View File

@ -12,6 +12,7 @@
#include <algorithm>
#include <cstdlib>
#include <type_traits>
#include <limits>
namespace DB
{
@ -24,28 +25,23 @@ extern const int CANNOT_DECOMPRESS;
namespace
{
UInt32 getDeltaTypeByteSize(UInt8 data_bytes_size)
Int64 getMaxValueForByteSize(UInt8 byte_size)
{
// both delta and double delta can be twice the size of a data item, but not less than 32 bits and not more than 64.
return std::min(64/8, std::max(32/8, data_bytes_size * 2));
}
UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
{
const UInt8 items_count_size = 4;
return items_count_size + data_bytes_size + getDeltaTypeByteSize(data_bytes_size);
}
UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
{
const UInt32 items_count = uncompressed_size / data_bytes_size;
// 11111 + max 64 bits of double delta.
const UInt32 max_item_size_bits = 5 + getDeltaTypeByteSize(data_bytes_size) * 8;
// + 8 is to round up to next byte.
return (items_count * max_item_size_bits + 8) / 8;
switch (byte_size)
{
case sizeof(UInt8):
return std::numeric_limits<Int8>::max();
case sizeof(UInt16):
return std::numeric_limits<Int16>::max();
case sizeof(UInt32):
return std::numeric_limits<Int32>::max();
case sizeof(UInt64):
return std::numeric_limits<Int64>::max();
default:
assert(false && "only 1, 2, 4 and 8 data sizes are supported");
}
__builtin_unreachable();
}
struct WriteSpec
@ -55,8 +51,10 @@ struct WriteSpec
const UInt8 data_bits;
};
const std::array<UInt8, 5> DELTA_SIZES{7, 9, 12, 32, 64};
template <typename T>
WriteSpec getWriteSpec(const T & value)
WriteSpec getDeltaWriteSpec(const T & value)
{
if (value > -63 && value < 64)
{
@ -80,27 +78,60 @@ WriteSpec getWriteSpec(const T & value)
}
}
template <typename T, typename DeltaType>
WriteSpec getDeltaMaxWriteSpecByteSize(UInt8 data_bytes_size)
{
return getDeltaWriteSpec(getMaxValueForByteSize(data_bytes_size));
}
UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
{
const UInt8 items_count_size = 4;
const UInt8 first_delta_bytes_size = data_bytes_size;
return items_count_size + data_bytes_size + first_delta_bytes_size;
}
UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
{
const UInt32 items_count = uncompressed_size / data_bytes_size;
const auto double_delta_write_spec = getDeltaMaxWriteSpecByteSize(data_bytes_size);
const UInt32 max_item_size_bits = double_delta_write_spec.prefix_bits + double_delta_write_spec.data_bits;
// + 7 is to round up to next byte.
auto result = (items_count * max_item_size_bits + 7) / 8;
return result;
}
template <typename ValueType>
UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
{
static_assert(std::is_unsigned_v<T> && std::is_signed_v<DeltaType>, "T must be unsigned, while DeltaType must be signed integer type.");
using UnsignedDeltaType = typename std::make_unsigned<DeltaType>::type;
// Since only unsigned integers have guaranteed two's-complement overflow handling, we do the math here on unsigned types.
// To simplify and bulletproof the code, we enforce ValueType to be unsigned too.
static_assert(std::is_unsigned_v<ValueType>, "ValueType must be unsigned.");
using UnsignedDeltaType = ValueType;
if (source_size % sizeof(T) != 0)
throw Exception("Cannot compress, data size " + toString(source_size) + " is not aligned to " + toString(sizeof(T)), ErrorCodes::CANNOT_COMPRESS);
// We use signed delta type to turn huge unsigned values into smaller signed:
// ffffffff => -1
using SignedDeltaType = typename std::make_signed<UnsignedDeltaType>::type;
if (source_size % sizeof(ValueType) != 0)
throw Exception("Cannot compress, data size " + toString(source_size)
+ " is not aligned to " + toString(sizeof(ValueType)), ErrorCodes::CANNOT_COMPRESS);
const char * source_end = source + source_size;
const UInt32 items_count = source_size / sizeof(T);
const UInt32 items_count = source_size / sizeof(ValueType);
unalignedStore<UInt32>(dest, items_count);
dest += sizeof(items_count);
T prev_value{};
DeltaType prev_delta{};
ValueType prev_value{};
UnsignedDeltaType prev_delta{};
if (source < source_end)
{
prev_value = unalignedLoad<T>(source);
unalignedStore<T>(dest, prev_value);
prev_value = unalignedLoad<ValueType>(source);
unalignedStore<ValueType>(dest, prev_value);
source += sizeof(prev_value);
dest += sizeof(prev_value);
@ -108,24 +139,26 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
if (source < source_end)
{
const T curr_value = unalignedLoad<T>(source);
prev_delta = static_cast<DeltaType>(curr_value - prev_value);
unalignedStore<DeltaType>(dest, prev_delta);
const ValueType curr_value = unalignedLoad<ValueType>(source);
prev_delta = curr_value - prev_value;
unalignedStore<UnsignedDeltaType>(dest, prev_delta);
source += sizeof(curr_value);
dest += sizeof(prev_delta);
prev_value = curr_value;
}
WriteBuffer buffer(dest, getCompressedDataSize(sizeof(T), source_size - sizeof(T)*2));
WriteBuffer buffer(dest, getCompressedDataSize(sizeof(ValueType), source_size - sizeof(ValueType)*2));
BitWriter writer(buffer);
for (; source < source_end; source += sizeof(T))
int item = 2;
for (; source < source_end; source += sizeof(ValueType), ++item)
{
const T curr_value = unalignedLoad<T>(source);
const ValueType curr_value = unalignedLoad<ValueType>(source);
const DeltaType delta = static_cast<DeltaType>(curr_value - prev_value);
const DeltaType double_delta = delta - prev_delta;
const UnsignedDeltaType delta = curr_value - prev_value;
const UnsignedDeltaType double_delta = delta - prev_delta;
prev_delta = delta;
prev_value = curr_value;
@ -136,9 +169,11 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
}
else
{
const auto sign = std::signbit(double_delta);
const auto abs_value = static_cast<UnsignedDeltaType>(std::abs(double_delta));
const auto write_spec = getWriteSpec(double_delta);
const SignedDeltaType signed_dd = static_cast<SignedDeltaType>(double_delta);
const auto sign = std::signbit(signed_dd);
// -1 shrinks dd down to fit into the number of bits, and there can't be a 0 here, so it is OK.
const auto abs_value = static_cast<UnsignedDeltaType>(std::abs(signed_dd) - 1);
const auto write_spec = getDeltaWriteSpec(signed_dd);
writer.writeBits(write_spec.prefix_bits, write_spec.prefix);
writer.writeBits(1, sign);
@ -151,22 +186,25 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
return sizeof(items_count) + sizeof(prev_value) + sizeof(prev_delta) + buffer.count();
}
template <typename T, typename DeltaType>
template <typename ValueType>
void decompressDataForType(const char * source, UInt32 source_size, char * dest)
{
static_assert(std::is_unsigned_v<T> && std::is_signed_v<DeltaType>, "T must be unsigned, while DeltaType must be signed integer type.");
static_assert(std::is_unsigned_v<ValueType>, "ValueType must be unsigned.");
using UnsignedDeltaType = ValueType;
using SignedDeltaType = typename std::make_signed<UnsignedDeltaType>::type;
const char * source_end = source + source_size;
const UInt32 items_count = unalignedLoad<UInt32>(source);
source += sizeof(items_count);
T prev_value{};
DeltaType prev_delta{};
ValueType prev_value{};
UnsignedDeltaType prev_delta{};
if (source < source_end)
{
prev_value = unalignedLoad<T>(source);
unalignedStore<T>(dest, prev_value);
prev_value = unalignedLoad<ValueType>(source);
unalignedStore<ValueType>(dest, prev_value);
source += sizeof(prev_value);
dest += sizeof(prev_value);
@ -174,9 +212,9 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
if (source < source_end)
{
prev_delta = unalignedLoad<DeltaType>(source);
prev_value = prev_value + static_cast<T>(prev_delta);
unalignedStore<T>(dest, prev_value);
prev_delta = unalignedLoad<UnsignedDeltaType>(source);
prev_value = prev_value + static_cast<ValueType>(prev_delta);
unalignedStore<ValueType>(dest, prev_value);
source += sizeof(prev_delta);
dest += sizeof(prev_value);
@ -189,32 +227,35 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
// we have to keep track of items to avoid reading more than there is.
for (UInt32 items_read = 2; items_read < items_count && !reader.eof(); ++items_read)
{
DeltaType double_delta = 0;
UnsignedDeltaType double_delta = 0;
if (reader.readBit() == 1)
{
const UInt8 data_sizes[] = {6, 8, 11, 31, 63};
UInt8 i = 0;
for (; i < sizeof(data_sizes) - 1; ++i)
for (; i < sizeof(DELTA_SIZES) - 1; ++i)
{
const auto next_bit = reader.readBit();
if (next_bit == 0)
{
break;
}
}
const UInt8 sign = reader.readBit();
double_delta = static_cast<DeltaType>(reader.readBits(data_sizes[i]));
SignedDeltaType signed_dd = static_cast<SignedDeltaType>(reader.readBits(DELTA_SIZES[i] - 1) + 1);
if (sign)
{
double_delta *= -1;
signed_dd *= -1;
}
double_delta = static_cast<UnsignedDeltaType>(signed_dd);
}
// else if first bit is zero, no need to read more data.
const T curr_value = prev_value + static_cast<T>(prev_delta + double_delta);
unalignedStore<T>(dest, curr_value);
const UnsignedDeltaType delta = double_delta + prev_delta;
const ValueType curr_value = prev_value + delta;
unalignedStore<ValueType>(dest, curr_value);
dest += sizeof(curr_value);
prev_delta = static_cast<DeltaType>(curr_value - prev_value);
prev_delta = curr_value - prev_value;
prev_value = curr_value;
}
}
@ -267,19 +308,20 @@ UInt32 CompressionCodecDoubleDelta::doCompressData(const char * source, UInt32 s
memcpy(&dest[2], source, bytes_to_skip);
size_t start_pos = 2 + bytes_to_skip;
UInt32 compressed_size = 0;
switch (data_bytes_size)
{
case 1:
compressed_size = compressDataForType<UInt8, Int16>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
compressed_size = compressDataForType<UInt8>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
case 2:
compressed_size = compressDataForType<UInt16, Int32>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
compressed_size = compressDataForType<UInt16>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
case 4:
compressed_size = compressDataForType<UInt32, Int64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
compressed_size = compressDataForType<UInt32>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
case 8:
compressed_size = compressDataForType<UInt64, Int64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
compressed_size = compressDataForType<UInt64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
break;
}
@ -296,16 +338,16 @@ void CompressionCodecDoubleDelta::doDecompressData(const char * source, UInt32 s
switch (bytes_size)
{
case 1:
decompressDataForType<UInt8, Int16>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
decompressDataForType<UInt8>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
case 2:
decompressDataForType<UInt16, Int32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
decompressDataForType<UInt16>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
case 4:
decompressDataForType<UInt32, Int64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
case 8:
decompressDataForType<UInt64, Int64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
break;
}
}
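A compact sketch of the arithmetic the rewritten codec implements: store the first value, then the first delta, then only delta-of-deltas, all on unsigned types so overflow wraps predictably (which is why the commit drops the signed DeltaType parameter). The bit-level prefix encoding from the diff is deliberately omitted; this is an illustration, not the production code path:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Encode: first value, first delta, then only double deltas.
std::vector<uint64_t> doubleDeltaEncode(const std::vector<uint64_t> & values)
{
    std::vector<uint64_t> out;
    uint64_t prev_delta = 0;
    for (size_t i = 0; i < values.size(); ++i)
    {
        if (i == 0)
            out.push_back(values[0]);
        else if (i == 1)
        {
            prev_delta = values[1] - values[0];
            out.push_back(prev_delta);
        }
        else
        {
            const uint64_t delta = values[i] - values[i - 1];
            out.push_back(delta - prev_delta); // the "double delta"
            prev_delta = delta;
        }
    }
    return out;
}

std::vector<uint64_t> doubleDeltaDecode(const std::vector<uint64_t> & enc)
{
    std::vector<uint64_t> out;
    uint64_t prev_value = 0;
    uint64_t prev_delta = 0;
    for (size_t i = 0; i < enc.size(); ++i)
    {
        if (i == 0)
            prev_value = enc[0];
        else if (i == 1)
        {
            prev_delta = enc[1];
            prev_value += prev_delta;
        }
        else
        {
            prev_delta += enc[i]; // delta = previous delta + double delta
            prev_value += prev_delta;
        }
        out.push_back(prev_value);
    }
    return out;
}

int main()
{
    const std::vector<uint64_t> values{10, 20, 30, 41, 53};
    assert(doubleDeltaDecode(doubleDeltaEncode(values)) == values);
}
```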

View File

@ -8,16 +8,16 @@
#include <boost/format.hpp>
#include <bitset>
#include <cmath>
#include <initializer_list>
#include <iomanip>
#include <memory>
#include <vector>
#include <typeinfo>
#include <iterator>
#include <optional>
#include <iostream>
#include <bitset>
#include <iterator>
#include <memory>
#include <typeinfo>
#include <vector>
#include <string.h>
#include <gtest/gtest.h>
@ -114,36 +114,71 @@ template <typename T, typename ContainerLeft, typename ContainerRight>
result = ::testing::AssertionFailure();
}
result << "mismatching " << sizeof(T) << "-byte item #" << i
<< "\nexpected: " << bin(left_value)
<< "\ngot : " << bin(right_value)
<< std::endl;
if (++mismatching_items >= MAX_MISMATCHING_ITEMS)
if (++mismatching_items <= MAX_MISMATCHING_ITEMS)
{
result << "..." << std::endl;
break;
result << "mismatching " << sizeof(T) << "-byte item #" << i
<< "\nexpected: " << bin(left_value) << " (0x" << std::hex << left_value << ")"
<< "\ngot : " << bin(right_value) << " (0x" << std::hex << right_value << ")"
<< std::endl;
if (mismatching_items == MAX_MISMATCHING_ITEMS)
{
result << "..." << std::endl;
}
}
}
}
if (mismatching_items > 0)
{
result << "\ntotal mismatching items:" << mismatching_items << " of " << size;
}
return result;
}
struct CodecTestParam
{
std::string type_name;
std::vector<char> source_data;
UInt8 data_byte_size;
double min_compression_ratio;
std::string case_name;
// to allow setting ratio after building with complex builder functions.
CodecTestParam && setRatio(const double & ratio) &&
{
this->min_compression_ratio = ratio;
return std::move(*this);
}
};
CodecTestParam operator+(CodecTestParam && left, CodecTestParam && right)
{
assert(left.type_name == right.type_name);
assert(left.data_byte_size == right.data_byte_size);
std::vector data(std::move(left.source_data));
data.insert(data.end(), right.source_data.begin(), right.source_data.end());
return CodecTestParam{
left.type_name,
std::move(data),
left.data_byte_size,
std::min(left.min_compression_ratio, right.min_compression_ratio),
left.case_name + " + " + right.case_name
};
}
std::ostream & operator<<(std::ostream & ostr, const CodecTestParam & param)
{
return ostr << "name: " << param.case_name
<< "\ntype name:" << param.type_name
<< "\nbyte size: " << static_cast<UInt32>(param.data_byte_size)
<< "\ndata size: " << param.source_data.size();
}
// compression ratio < 1.0 means that codec output is smaller than input.
const double DEFAULT_MIN_COMPRESSION_RATIO = 1.0;
template <typename T, typename... Args>
CodecTestParam makeParam(Args && ... args)
{
@ -157,11 +192,11 @@ CodecTestParam makeParam(Args && ... args)
write_pos += sizeof(v);
}
return CodecTestParam{std::move(data), sizeof(T),
(boost::format("%1% %2%") % (sizeof(T) * std::size(vals)) % " predefined values").str()};
return CodecTestParam{type_name<T>(), std::move(data), sizeof(T), DEFAULT_MIN_COMPRESSION_RATIO,
(boost::format("%1% values of %2%") % std::size(vals) % type_name<T>()).str()};
}
template <typename T, size_t Begin = 1, size_t End = 10000, typename Generator>
template <typename T, size_t Begin = 1, size_t End = 10001, typename Generator>
CodecTestParam generateParam(Generator gen, const char* gen_name)
{
static_assert (End >= Begin, "End must be not less than Begin");
@ -176,8 +211,8 @@ CodecTestParam generateParam(Generator gen, const char* gen_name)
write_pos += sizeof(v);
}
return CodecTestParam{std::move(data), sizeof(T),
(boost::format("%1% from %2% (%3% => %4%)") % type_name<T>() % gen_name % Begin % End).str()};
return CodecTestParam{type_name<T>(), std::move(data), sizeof(T), DEFAULT_MIN_COMPRESSION_RATIO,
(boost::format("%1% values of %2% from %3%") % (End - Begin) % type_name<T>() % gen_name).str()};
}
void TestTranscoding(ICompressionCodec * codec, const CodecTestParam & param)
@ -211,6 +246,13 @@ void TestTranscoding(ICompressionCodec * codec, const CodecTestParam & param)
default:
FAIL() << "Invalid data_byte_size: " << param.data_byte_size;
}
const auto header_size = codec->getHeaderSize();
const auto compression_ratio = (encoded_size - header_size) / (source_data.size() * 1.0);
ASSERT_LE(compression_ratio, param.min_compression_ratio)
<< "\n\tdecoded size: " << source_data.size()
<< "\n\tencoded size: " << encoded_size
<< "(no header: " << encoded_size - header_size << ")";
}
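The new assertion bounds the payload-to-input ratio, where a value below 1.0 means the codec actually shrank the data. A toy check of that arithmetic (the 9-byte header below is only an assumed example value):

```cpp
#include <cassert>
#include <cstddef>

// (encoded - header) / decoded, matching the check added above.
double compressionRatio(size_t encoded_size, size_t header_size, size_t decoded_size)
{
    return static_cast<double>(encoded_size - header_size) / decoded_size;
}

int main()
{
    // e.g. 1000 input bytes compressed into a 9-byte header plus 540 bytes of payload
    assert(compressionRatio(549, 9, 1000) <= 1.0);
}
```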
class CodecTest : public ::testing::TestWithParam<CodecTestParam>
@ -225,20 +267,34 @@ public:
TEST_P(CodecTest, DoubleDelta)
{
const auto & param = GetParam();
auto param = GetParam();
auto codec = std::make_unique<CompressionCodecDoubleDelta>(param.data_byte_size);
if (param.type_name == type_name<Float32>() || param.type_name == type_name<Float64>())
{
// double delta doesn't work great with many cases of floats and may result in very poor compression rate.
param.min_compression_ratio *= 1.5;
}
TestTranscoding(codec.get(), param);
}
TEST_P(CodecTest, Gorilla)
{
const auto & param = GetParam();
auto param = GetParam();
auto codec = std::make_unique<CompressionCodecGorilla>(param.data_byte_size);
if (param.type_name == type_name<UInt32>() || param.type_name == type_name<Int32>()
|| param.type_name == type_name<UInt64>() || param.type_name == type_name<Int64>())
{
// gorilla doesn't work great with many cases of integers and may result in very poor compression rate.
param.min_compression_ratio *= 1.5;
}
TestTranscoding(codec.get(), param);
}
// Here we use generators to produce test payload for codecs.
// Generator is a callable that should produce output value of the same type as input value.
auto SameValueGenerator = [](auto value)
{
return [=](auto i)
@ -256,30 +312,44 @@ auto SequentialGenerator = [](auto stride = 1)
};
};
// Generator that helps debugging output of other generators
// by logging every output value alongside iteration index and input.
//auto LoggingProxyGenerator = [](auto other_generator, const char * name, std::ostream & ostr, const int limit = std::numeric_limits<int>::max())
//{
// ostr << "\n\nValues from " << name << ":\n";
// auto count = std::make_shared<int>(0);
// return [&, count](auto i)
// {
// using ValueType = decltype(i);
// const auto ret = static_cast<ValueType>(other_generator(i));
// if (++(*count) < limit)
// {
// ostr << "\t" << *count << " : " << i << " => " << ret << "\n";
// }
// return ret;
// };
//};
template <typename T>
struct MonotonicGenerator
{
MonotonicGenerator(T stride = 1, size_t max_step = 10)
: prev_value{},
: prev_value(0),
stride(stride),
max_step(max_step)
{}
template <typename U>
U operator()(U i)
U operator()(U)
{
if (!prev_value.has_value())
{
prev_value = i * stride;
}
const U result = *prev_value + static_cast<T>(stride * (rand() % max_step));
const U result = prev_value + static_cast<T>(stride * (rand() % max_step));
prev_value = result;
return result;
}
std::optional<T> prev_value;
T prev_value;
const T stride;
const size_t max_step;
};
@ -296,25 +366,45 @@ auto MinMaxGenerator = [](auto i)
}
};
auto RandomGenerator = [](auto i) {return static_cast<decltype(i)>(rand());};
template <typename T>
struct RandomGenerator
{
RandomGenerator(T seed = 0, T value_cap = std::numeric_limits<T>::max())
: e(seed),
value_cap(value_cap)
{
}
template <typename U>
U operator()(U i)
{
return static_cast<decltype(i)>(distribution(e) % value_cap);
}
private:
std::default_random_engine e;
std::uniform_int_distribution<T> distribution;
const T value_cap;
};
auto RandomishGenerator = [](auto i)
{
return static_cast<decltype(i)>(sin(static_cast<double>(i) * i) * i);
};
INSTANTIATE_TEST_CASE_P(Basic,
// helper macro to produce human-friendly test case name
#define G(generator) generator, #generator
INSTANTIATE_TEST_CASE_P(Mixed,
CodecTest,
::testing::Values(
makeParam<UInt32>(1, 2, 3, 4),
makeParam<UInt64>(1, 2, 3, 4),
makeParam<Float32>(1.1, 2.2, 3.3, 4.4),
makeParam<Float64>(1.1, 2.2, 3.3, 4.4)
generateParam<Int32, 1, 3>(G(MinMaxGenerator)) + generateParam<Int32, 1, 11>(G(SequentialGenerator(1))).setRatio(1),
generateParam<UInt32, 1, 3>(G(MinMaxGenerator)) + generateParam<UInt32, 1, 11>(G(SequentialGenerator(1))).setRatio(1),
generateParam<Int64, 1, 3>(G(MinMaxGenerator)) + generateParam<Int64, 1, 11>(G(SequentialGenerator(1))).setRatio(1),
generateParam<UInt64, 1, 3>(G(MinMaxGenerator)) + generateParam<UInt64, 1, 11>(G(SequentialGenerator(1))).setRatio(1)
),
);
#define G(generator) generator, #generator
INSTANTIATE_TEST_CASE_P(Same,
CodecTest,
::testing::Values(
@ -354,18 +444,20 @@ INSTANTIATE_TEST_CASE_P(Monotonic,
INSTANTIATE_TEST_CASE_P(Random,
CodecTest,
::testing::Values(
generateParam<UInt32>(G(RandomGenerator)),
generateParam<UInt64>(G(RandomGenerator))
generateParam<UInt32>(G(RandomGenerator<UInt32>(0, 1000'000'000))).setRatio(1.2),
generateParam<UInt64>(G(RandomGenerator<UInt64>(0, 1000'000'000))).setRatio(1.1)
),
);
INSTANTIATE_TEST_CASE_P(RandomLike,
INSTANTIATE_TEST_CASE_P(Randomish,
CodecTest,
::testing::Values(
generateParam<Int32>(G(RandomishGenerator)),
generateParam<Int64>(G(RandomishGenerator)),
generateParam<Float32>(G(RandomishGenerator)),
generateParam<Float64>(G(RandomishGenerator))
generateParam<Int32>(G(RandomishGenerator)).setRatio(1.1),
generateParam<Int64>(G(RandomishGenerator)).setRatio(1.1),
generateParam<UInt32>(G(RandomishGenerator)).setRatio(1.1),
generateParam<UInt64>(G(RandomishGenerator)).setRatio(1.1),
generateParam<Float32>(G(RandomishGenerator)).setRatio(1.1),
generateParam<Float64>(G(RandomishGenerator)).setRatio(1.1)
),
);

View File

@ -56,7 +56,7 @@
#define DBMS_MIN_REVISION_WITH_LOW_CARDINALITY_TYPE 54405
#define DBMS_MIN_REVISION_WITH_CLIENT_WRITE_INFO 54421
#define DBMS_MIN_REVISION_WITH_CLIENT_WRITE_INFO 54420
/// Version of ClickHouse TCP protocol. Set to git tag with latest protocol change.
#define DBMS_TCP_PROTOCOL_VERSION 54226

View File

@ -1,6 +1,7 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionBinaryArithmetic.h>
#include <boost/integer/common_factor.hpp>
#include <numeric>
namespace DB
{
@ -15,7 +16,7 @@ struct GCDImpl
{
throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<A>::Type(a), typename NumberTraits::ToInteger<B>::Type(b));
throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<B>::Type(b), typename NumberTraits::ToInteger<A>::Type(a));
return boost::integer::gcd(
return std::gcd(
typename NumberTraits::ToInteger<Result>::Type(a),
typename NumberTraits::ToInteger<Result>::Type(b));
}

View File

@ -1,6 +1,7 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionBinaryArithmetic.h>
#include <boost/integer/common_factor.hpp>
#include <numeric>
namespace DB
{
@ -15,7 +16,7 @@ struct LCMImpl
{
throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<A>::Type(a), typename NumberTraits::ToInteger<B>::Type(b));
throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<B>::Type(b), typename NumberTraits::ToInteger<A>::Type(a));
return boost::integer::lcm(
return std::lcm(
typename NumberTraits::ToInteger<Result>::Type(a),
typename NumberTraits::ToInteger<Result>::Type(b));
}
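Both hunks swap boost::integer::gcd/lcm for the standard equivalents, which live in <numeric> since C++17. A minimal usage check:

```cpp
#include <cassert>
#include <numeric> // std::gcd and std::lcm, standard since C++17

int main()
{
    assert(std::gcd(12, 18) == 6);
    assert(std::lcm(4, 6) == 12);
}
```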

View File

@ -150,7 +150,6 @@ public:
const UInt64 mask = maskLowBits<UInt64>(to_write);
v &= mask;
// assert(v <= 255);
bits_buffer <<= to_write;
bits_buffer |= v;
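The removed assert is replaced by masking v down to the low bits being written. The real maskLowBits helper lives elsewhere in the tree, so the following is only an assumption about its semantics:

```cpp
#include <cassert>
#include <cstdint>

// Presumed semantics: a value with the lowest `bits` bits set, e.g. maskLowBits(3) == 0b111.
// The bits >= 64 guard avoids the undefined behaviour of a full-width shift.
uint64_t maskLowBits(uint8_t bits)
{
    if (bits >= 64)
        return ~uint64_t{0};
    return (uint64_t{1} << bits) - 1;
}

int main()
{
    assert(maskLowBits(8) == 0xFF);
    assert((uint64_t{0x1234} & maskLowBits(8)) == 0x34);
}
```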

View File

@ -29,11 +29,10 @@ namespace ErrorCodes
void ThreadStatus::attachQueryContext(Context & query_context_)
{
query_context = &query_context_;
query_id = query_context->getCurrentQueryId();
if (!global_context)
global_context = &query_context->getGlobalContext();
query_id = query_context->getCurrentQueryId();
if (thread_group)
{
std::lock_guard lock(thread_group->mutex);
@ -106,6 +105,9 @@ void ThreadStatus::attachQuery(const ThreadGroupStatusPtr & thread_group_, bool
thread_group->thread_numbers.emplace_back(thread_number);
}
if (query_context)
query_id = query_context->getCurrentQueryId();
#if defined(__linux__)
/// Set "nice" value if required.
if (query_context)
@ -269,7 +271,7 @@ void CurrentThread::attachQueryContext(Context & query_context)
{
if (unlikely(!current_thread))
return;
return current_thread->attachQueryContext(query_context);
current_thread->attachQueryContext(query_context);
}
void CurrentThread::finalizePerformanceCounters()

View File

@ -178,6 +178,24 @@ const KeyCondition::AtomMap KeyCondition::atom_map
return true;
}
},
{
"empty",
[] (RPNElement & out, const Field &)
{
out.function = RPNElement::FUNCTION_IN_RANGE;
out.range = Range("");
return true;
}
},
{
"notEmpty",
[] (RPNElement & out, const Field &)
{
out.function = RPNElement::FUNCTION_NOT_IN_RANGE;
out.range = Range("");
return true;
}
},
{
"like",
[] (RPNElement & out, const Field & value)
@ -199,6 +217,27 @@ const KeyCondition::AtomMap KeyCondition::atom_map
return true;
}
},
{
"notLike",
[] (RPNElement & out, const Field & value)
{
if (value.getType() != Field::Types::String)
return false;
String prefix = extractFixedPrefixFromLikePattern(value.get<const String &>());
if (prefix.empty())
return false;
String right_bound = firstStringThatIsGreaterThanAllStringsWithPrefix(prefix);
out.function = RPNElement::FUNCTION_NOT_IN_RANGE;
out.range = !right_bound.empty()
? Range(prefix, true, right_bound, false)
: Range::createLeftBounded(prefix, true);
return true;
}
},
{
"startsWith",
[] (RPNElement & out, const Field & value)
@ -645,92 +684,102 @@ bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Blo
{
const ASTs & args = func->arguments->children;
if (args.size() != 2)
return false;
DataTypePtr key_expr_type; /// Type of expression containing key column
size_t key_arg_pos; /// Position of argument with key column (non-const argument)
size_t key_column_num = -1; /// Number of a key column (inside key_column_names array)
MonotonicFunctionsChain chain;
bool is_set_const = false;
bool is_constant_transformed = false;
std::string func_name = func->name;
if (functionIsInOrGlobalInOperator(func->name)
&& tryPrepareSetIndex(args, context, out, key_column_num))
if (args.size() == 1)
{
key_arg_pos = 0;
is_set_const = true;
if (!(isKeyPossiblyWrappedByMonotonicFunctions(args[0], context, key_column_num, key_expr_type, chain)))
return false;
if (key_column_num == static_cast<size_t>(-1))
throw Exception("`key_column_num` wasn't initialized. It is a bug.", ErrorCodes::LOGICAL_ERROR);
}
else if (getConstant(args[1], block_with_constants, const_value, const_type)
&& isKeyPossiblyWrappedByMonotonicFunctions(args[0], context, key_column_num, key_expr_type, chain))
else if (args.size() == 2)
{
key_arg_pos = 0;
}
else if (getConstant(args[1], block_with_constants, const_value, const_type)
&& canConstantBeWrappedByMonotonicFunctions(args[0], key_column_num, key_expr_type, const_value, const_type))
{
key_arg_pos = 0;
is_constant_transformed = true;
}
else if (getConstant(args[0], block_with_constants, const_value, const_type)
&& isKeyPossiblyWrappedByMonotonicFunctions(args[1], context, key_column_num, key_expr_type, chain))
{
key_arg_pos = 1;
}
else if (getConstant(args[0], block_with_constants, const_value, const_type)
&& canConstantBeWrappedByMonotonicFunctions(args[1], key_column_num, key_expr_type, const_value, const_type))
{
key_arg_pos = 1;
is_constant_transformed = true;
size_t key_arg_pos; /// Position of argument with key column (non-const argument)
bool is_set_const = false;
bool is_constant_transformed = false;
if (functionIsInOrGlobalInOperator(func_name)
&& tryPrepareSetIndex(args, context, out, key_column_num))
{
key_arg_pos = 0;
is_set_const = true;
}
else if (getConstant(args[1], block_with_constants, const_value, const_type)
&& isKeyPossiblyWrappedByMonotonicFunctions(args[0], context, key_column_num, key_expr_type, chain))
{
key_arg_pos = 0;
}
else if (getConstant(args[1], block_with_constants, const_value, const_type)
&& canConstantBeWrappedByMonotonicFunctions(args[0], key_column_num, key_expr_type, const_value, const_type))
{
key_arg_pos = 0;
is_constant_transformed = true;
}
else if (getConstant(args[0], block_with_constants, const_value, const_type)
&& isKeyPossiblyWrappedByMonotonicFunctions(args[1], context, key_column_num, key_expr_type, chain))
{
key_arg_pos = 1;
}
else if (getConstant(args[0], block_with_constants, const_value, const_type)
&& canConstantBeWrappedByMonotonicFunctions(args[1], key_column_num, key_expr_type, const_value, const_type))
{
key_arg_pos = 1;
is_constant_transformed = true;
}
else
return false;
if (key_column_num == static_cast<size_t>(-1))
throw Exception("`key_column_num` wasn't initialized. It is a bug.", ErrorCodes::LOGICAL_ERROR);
/// Transformed constant must weaken the condition, for example "x > 5" must weaken to "round(x) >= 5"
if (is_constant_transformed)
{
if (func_name == "less")
func_name = "lessOrEquals";
else if (func_name == "greater")
func_name = "greaterOrEquals";
}
/// Replace <const> <sign> <data> on to <data> <-sign> <const>
if (key_arg_pos == 1)
{
if (func_name == "less")
func_name = "greater";
else if (func_name == "greater")
func_name = "less";
else if (func_name == "greaterOrEquals")
func_name = "lessOrEquals";
else if (func_name == "lessOrEquals")
func_name = "greaterOrEquals";
else if (func_name == "in" || func_name == "notIn" || func_name == "like")
{
/// "const IN data_column" doesn't make sense (unlike "data_column IN const")
return false;
}
}
bool cast_not_needed =
is_set_const /// Set args are already casted inside Set::createFromAST
|| (isNativeNumber(key_expr_type) && isNativeNumber(const_type)); /// Numbers are accurately compared without cast.
if (!cast_not_needed)
castValueToType(key_expr_type, const_value, const_type, node);
}
else
return false;
if (key_column_num == static_cast<size_t>(-1))
throw Exception("`key_column_num` wasn't initialized. It is a bug.", ErrorCodes::LOGICAL_ERROR);
std::string func_name = func->name;
/// Transformed constant must weaken the condition, for example "x > 5" must weaken to "round(x) >= 5"
if (is_constant_transformed)
{
if (func_name == "less")
func_name = "lessOrEquals";
else if (func_name == "greater")
func_name = "greaterOrEquals";
}
/// Replace <const> <sign> <data> on to <data> <-sign> <const>
if (key_arg_pos == 1)
{
if (func_name == "less")
func_name = "greater";
else if (func_name == "greater")
func_name = "less";
else if (func_name == "greaterOrEquals")
func_name = "lessOrEquals";
else if (func_name == "lessOrEquals")
func_name = "greaterOrEquals";
else if (func_name == "in" || func_name == "notIn" || func_name == "like")
{
/// "const IN data_column" doesn't make sense (unlike "data_column IN const")
return false;
}
}
out.key_column = key_column_num;
out.monotonic_functions_chain = std::move(chain);
const auto atom_it = atom_map.find(func_name);
if (atom_it == std::end(atom_map))
return false;
bool cast_not_needed =
is_set_const /// Set args are already casted inside Set::createFromAST
|| (isNativeNumber(key_expr_type) && isNativeNumber(const_type)); /// Numbers are accurately compared without cast.
if (!cast_not_needed)
castValueToType(key_expr_type, const_value, const_type, node);
out.key_column = key_column_num;
out.monotonic_functions_chain = std::move(chain);
return atom_it->second(out, const_value);
}
@ -748,7 +797,6 @@ bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Blo
return true;
}
}
return false;
}
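The key_arg_pos == 1 branch above mirrors the comparison when the constant is on the left ("5 < x" is treated as "x > 5"). A tiny sketch of that mapping, with illustrative names rather than the actual ClickHouse helpers:

```cpp
#include <cassert>
#include <string>

// When the constant is the first argument, the atom is rewritten
// as if the key column were first, exactly as in the diff above.
std::string mirrorComparison(const std::string & name)
{
    if (name == "less") return "greater";
    if (name == "greater") return "less";
    if (name == "lessOrEquals") return "greaterOrEquals";
    if (name == "greaterOrEquals") return "lessOrEquals";
    return name; // equals / notEquals are symmetric
}

int main()
{
    assert(mirrorComparison("less") == "greater");
    assert(mirrorComparison("equals") == "equals");
}
```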

View File

@ -142,7 +142,7 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
"like",
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
{
out.function = RPNElement::FUNCTION_LIKE;
out.function = RPNElement::FUNCTION_EQUALS;
out.bloom_filter = std::make_unique<BloomFilter>(
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
@ -151,6 +151,66 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
return true;
}
},
{
"notLike",
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
{
out.function = RPNElement::FUNCTION_NOT_EQUALS;
out.bloom_filter = std::make_unique<BloomFilter>(
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
const auto & str = value.get<String>();
likeStringToBloomFilter(str, idx.token_extractor_func, *out.bloom_filter);
return true;
}
},
{
"startsWith",
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
{
out.function = RPNElement::FUNCTION_EQUALS;
out.bloom_filter = std::make_unique<BloomFilter>(
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
const auto & prefix = value.get<String>();
stringToBloomFilter(prefix.c_str(), prefix.size(), idx.token_extractor_func, *out.bloom_filter);
return true;
}
},
{
"endsWith",
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
{
out.function = RPNElement::FUNCTION_EQUALS;
out.bloom_filter = std::make_unique<BloomFilter>(
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
const auto & suffix = value.get<String>();
stringToBloomFilter(suffix.c_str(), suffix.size(), idx.token_extractor_func, *out.bloom_filter);
return true;
}
},
{
"multiSearchAny",
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
{
out.function = RPNElement::FUNCTION_MULTI_SEARCH;
std::vector<std::vector<BloomFilter>> bloom_filters;
bloom_filters.emplace_back();
for (const auto & element : value.get<Array>())
{
if (element.getType() != Field::Types::String)
return false;
bloom_filters.back().emplace_back(idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
const auto & str = element.get<String>();
stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, bloom_filters.back().back());
}
out.set_bloom_filters = std::move(bloom_filters);
return true;
}
},
{
"notIn",
[] (RPNElement & out, const Field &, const MergeTreeIndexFullText &)
@ -197,10 +257,9 @@ bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const
}
else if (element.function == RPNElement::FUNCTION_EQUALS
|| element.function == RPNElement::FUNCTION_NOT_EQUALS
|| element.function == RPNElement::FUNCTION_LIKE
|| element.function == RPNElement::FUNCTION_NOT_LIKE
|| element.function == RPNElement::FUNCTION_IN
|| element.function == RPNElement::FUNCTION_NOT_IN
|| element.function == RPNElement::FUNCTION_MULTI_SEARCH
|| element.function == RPNElement::ALWAYS_FALSE)
{
rpn_stack.push_back(false);
@ -255,17 +314,8 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx
if (element.function == RPNElement::FUNCTION_NOT_EQUALS)
rpn_stack.back() = !rpn_stack.back();
}
else if (element.function == RPNElement::FUNCTION_LIKE
|| element.function == RPNElement::FUNCTION_NOT_LIKE)
{
rpn_stack.emplace_back(
granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true);
if (element.function == RPNElement::FUNCTION_NOT_LIKE)
rpn_stack.back() = !rpn_stack.back();
}
else if (element.function == RPNElement::FUNCTION_IN
|| element.function == RPNElement::FUNCTION_NOT_IN)
|| element.function == RPNElement::FUNCTION_NOT_IN)
{
std::vector<bool> result(element.set_bloom_filters.back().size(), true);
@ -283,6 +333,18 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx
if (element.function == RPNElement::FUNCTION_NOT_IN)
rpn_stack.back() = !rpn_stack.back();
}
else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH)
{
std::vector<bool> result(element.set_bloom_filters.back().size(), true);
const auto & bloom_filters = element.set_bloom_filters[0];
for (size_t row = 0; row < bloom_filters.size(); ++row)
result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]);
rpn_stack.emplace_back(
std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true);
}
else if (element.function == RPNElement::FUNCTION_NOT)
{
rpn_stack.back() = !rpn_stack.back();
@ -343,8 +405,9 @@ bool MergeTreeConditionFullText::atomFromAST(
size_t key_arg_pos; /// Position of argument with key column (non-const argument)
size_t key_column_num = -1; /// Number of a key column (inside key_column_names array)
std::string func_name = func->name;
if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out))
if (functionIsInOrGlobalInOperator(func_name) && tryPrepareSetBloomFilter(args, out))
{
key_arg_pos = 0;
}
@ -359,17 +422,17 @@ bool MergeTreeConditionFullText::atomFromAST(
else
return false;
if (const_type && const_type->getTypeId() != TypeIndex::String && const_type->getTypeId() != TypeIndex::FixedString)
if (const_type && const_type->getTypeId() != TypeIndex::String
&& const_type->getTypeId() != TypeIndex::FixedString
&& const_type->getTypeId() != TypeIndex::Array)
return false;
if (key_arg_pos == 1 && (func->name != "equals" || func->name != "notEquals"))
if (key_arg_pos == 1 && (func_name != "equals" || func_name != "notEquals"))
return false;
else if (!index.token_extractor_func->supportLike() && (func->name == "like" || func->name == "notLike"))
else if (!index.token_extractor_func->supportLike() && (func_name == "like" || func_name == "notLike"))
return false;
else
key_arg_pos = 0;
const auto atom_it = atom_map.find(func->name);
const auto atom_it = atom_map.find(func_name);
if (atom_it == std::end(atom_map))
return false;
@ -380,8 +443,8 @@ bool MergeTreeConditionFullText::atomFromAST(
{
/// Check constant like in KeyCondition
if (const_value.getType() == Field::Types::UInt64
|| const_value.getType() == Field::Types::Int64
|| const_value.getType() == Field::Types::Float64)
|| const_value.getType() == Field::Types::Int64
|| const_value.getType() == Field::Types::Float64)
{
/// Zero in all types is represented in memory the same way as in UInt64.
out.function = const_value.get<UInt64>()
@ -475,7 +538,6 @@ bool MergeTreeConditionFullText::tryPrepareSetBloomFilter(
return true;
}
MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const
{
return std::make_shared<MergeTreeIndexGranuleFullText>(*this);
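The FUNCTION_MULTI_SEARCH branch above reduces to: the granule may match if any needle might be present in its bloom filter. A hedged sketch of that logic with a stand-in filter — the real BloomFilter is ClickHouse's own class; a plain set is used here, which unlike a bloom filter never reports false positives:

```cpp
#include <cassert>
#include <string>
#include <unordered_set>
#include <vector>

// Stand-in for a per-granule bloom filter. A real bloom filter may answer
// "maybe" for absent strings, which is why the index can only skip granules,
// never confirm matches.
struct FakeBloomFilter
{
    std::unordered_set<std::string> tokens;
    bool contains(const std::string & s) const { return tokens.count(s) != 0; }
};

// multiSearchAny(s, needles) may be true on a granule if any needle may be present.
bool mayBeTrueMultiSearchAny(const FakeBloomFilter & granule, const std::vector<std::string> & needles)
{
    for (const auto & needle : needles)
        if (granule.contains(needle))
            return true;
    return false;
}

int main()
{
    FakeBloomFilter granule{{"abra", "cadabra"}};
    assert(mayBeTrueMultiSearchAny(granule, {"data", "cadabra"}));
    assert(!mayBeTrueMultiSearchAny(granule, {"base"}));
}
```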

View File

@ -78,10 +78,9 @@ private:
/// Atoms of a Boolean expression.
FUNCTION_EQUALS,
FUNCTION_NOT_EQUALS,
FUNCTION_LIKE,
FUNCTION_NOT_LIKE,
FUNCTION_IN,
FUNCTION_NOT_IN,
FUNCTION_MULTI_SEARCH,
FUNCTION_UNKNOWN, /// Can take any value.
/// Operators of the logical expression.
FUNCTION_NOT,
@ -93,15 +92,20 @@ private:
};
RPNElement(
Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr<BloomFilter> && const_bloom_filter_ = nullptr)
: function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}
Function function = FUNCTION_UNKNOWN;
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS, FUNCTION_LIKE, FUNCTION_NOT_LIKE.
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS and FUNCTION_MULTI_SEARCH
size_t key_column;
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
std::unique_ptr<BloomFilter> bloom_filter;
/// For FUNCTION_IN and FUNCTION_NOT_IN
/// For FUNCTION_IN, FUNCTION_NOT_IN and FUNCTION_MULTI_SEARCH
std::vector<std::vector<BloomFilter>> set_bloom_filters;
/// For FUNCTION_IN and FUNCTION_NOT_IN
std::vector<size_t> set_key_position;
};

View File

@ -411,7 +411,10 @@ static bool checkAtomName(const String & name)
"greaterOrEquals",
"in",
"notIn",
"like"
"like",
"startsWith",
"endsWith",
"multiSearchAny"
};
return atoms.find(name) != atoms.end();
}

View File

@ -46,13 +46,13 @@ MergeTreeSelectBlockInputStream::MergeTreeSelectBlockInputStream(
for (const auto & range : all_mark_ranges)
total_marks_count += range.end - range.begin;
size_t total_rows = data_part->index_granularity.getTotalRows();
size_t total_rows = data_part->index_granularity.getRowsCountInRanges(all_mark_ranges);
if (!quiet)
LOG_TRACE(log, "Reading " << all_mark_ranges.size() << " ranges from part " << data_part->name
<< ", approx. " << total_rows
<< (all_mark_ranges.size() > 1
? ", up to " + toString(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges))
? ", up to " + toString(total_rows)
: "")
<< " rows starting from " << data_part->index_granularity.getMarkStartingRow(all_mark_ranges.front().begin));

View File

@ -0,0 +1,10 @@
<?xml version="1.0"?>
<!-- Config for test server -->
<yandex>
<query_masking_rules>
<rule>
<regexp>TOPSECRET.TOPSECRET</regexp>
<replace>[hidden]</replace>
</rule>
</query_masking_rules>
</yandex>
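The rule is a regexp-based substitution applied to query text before it reaches the logs. A minimal illustration of the effect with the standard regex library (the server-side implementation itself is not part of this diff):

```cpp
#include <cassert>
#include <regex>
#include <string>

int main()
{
    const std::string query = "SELECT 'TOPSECRET.TOPSECRET' AS secret";
    const std::string masked = std::regex_replace(query, std::regex("TOPSECRET.TOPSECRET"), "[hidden]");
    assert(masked == "SELECT '[hidden]' AS secret");
}
```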

View File

@ -11,13 +11,14 @@ node18_14 = cluster.add_instance('node18_14', image='yandex/clickhouse-server:18
node19_1 = cluster.add_instance('node19_1', image='yandex/clickhouse-server:19.1.16', with_installed_binary=True)
node19_4 = cluster.add_instance('node19_4', image='yandex/clickhouse-server:19.4.5.35', with_installed_binary=True)
node19_6 = cluster.add_instance('node19_6', image='yandex/clickhouse-server:19.6.3.18', with_installed_binary=True)
node19_8 = cluster.add_instance('node19_8', image='yandex/clickhouse-server:19.8.3.8', with_installed_binary=True)
node_new = cluster.add_instance('node_new')
@pytest.fixture(scope="module")
def setup_nodes():
try:
cluster.start()
for n in (node18_14, node19_1, node19_4, node19_6, node_new):
for n in (node18_14, node19_1, node19_4, node19_6, node19_8, node_new):
n.query('''CREATE TABLE test_table (id UInt32, value UInt64) ENGINE = MergeTree() ORDER BY tuple()''')
yield cluster
@ -29,7 +30,7 @@ def query_from_one_node_to_another(client_node, server_node, query):
client_node.exec_in_container(["bash", "-c", "/usr/bin/clickhouse client --host {} --query '{}'".format(server_node.name, query)])
def test_client_from_different_versions(setup_nodes):
old_nodes = (node18_14, node19_1, node19_4, node19_6,)
old_nodes = (node18_14, node19_1, node19_4, node19_6, node19_8)
# from new to old
for n in old_nodes:
query_from_one_node_to_another(node_new, n, "INSERT INTO test_table VALUES (1, 1)")

View File

@ -0,0 +1,53 @@
9 abra
14 abracadabra
"rows_read": 6,
8 computer science
"rows_read": 2,
9 abra
10 cadabra
11 crabacadabra
14 abracadabra
15 cadabraabra
"rows_read": 6,
6 some string
7 another string
"rows_read": 2,
9 abra
14 abracadabra
"rows_read": 6,
8 computer science
"rows_read": 2,
1 ClickHouse is a column-oriented database management system (DBMS)
2 column-oriented database management system
13 basement
"rows_read": 6,
6 some string
7 another string
"rows_read": 2,
6 some string
7 another string
8 computer science
"rows_read": 4,
1 ClickHouse is a column-oriented database management system (DBMS)
2 column-oriented database management system
13 basement
"rows_read": 6,
9 abra
10 cadabra
11 crabacadabra
14 abracadabra
15 cadabraabra
"rows_read": 6,
4 какая-то строка
5 еще строка
6 some string
7 another string
"rows_read": 4,
14 abracadabra
"rows_read": 4,
1 ClickHouse is a column-oriented database management system (DBMS)
2 column-oriented database management system
10 cadabra
11 crabacadabra
15 cadabraabra
"rows_read": 8,

View File

@ -0,0 +1,86 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS bloom_filter_idx;"
# NGRAM BF
$CLICKHOUSE_CLIENT -n --query="
SET allow_experimental_data_skipping_indices = 1;
CREATE TABLE bloom_filter_idx
(
k UInt64,
s String,
INDEX bf (s, lower(s)) TYPE ngrambf_v1(3, 512, 2, 0) GRANULARITY 1
) ENGINE = MergeTree()
ORDER BY k
SETTINGS index_granularity = 2;"
$CLICKHOUSE_CLIENT --query="INSERT INTO bloom_filter_idx VALUES
(0, 'ClickHouse - столбцовая система управления базами данных (СУБД)'),
(1, 'ClickHouse is a column-oriented database management system (DBMS)'),
(2, 'column-oriented database management system'),
(3, 'columns'),
(4, 'какая-то строка'),
(5, 'еще строка'),
(6, 'some string'),
(7, 'another string'),
(8, 'computer science'),
(9, 'abra'),
(10, 'cadabra'),
(11, 'crabacadabra'),
(12, 'crab'),
(13, 'basement'),
(14, 'abracadabra'),
(15, 'cadabraabra')"
# STARTS_WITH
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'computer') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'computer') ORDER BY k FORMAT JSON" | grep "rows_read"
# ENDS_WITH
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'abra') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'abra') ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'ring') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'ring') ORDER BY k FORMAT JSON" | grep "rows_read"
# COMBINED
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra') FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'c') AND endsWith(s, 'science')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'c') AND endsWith(s, 'science') FORMAT JSON" | grep "rows_read"
# MULTI_SEARCH_ANY
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['data', 'base'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['data', 'base']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string', 'computer'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string', 'computer']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['base', 'seme', 'gement'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['base', 'seme', 'gement']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['abra', 'cadabra', 'cab', 'extra'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['abra', 'cadabra', 'cab', 'extra']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['строка', 'string'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['строка', 'string']) FORMAT JSON" | grep "rows_read"
# MULTI_SEARCH_ANY + OTHER
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND startsWith(s, 'abra')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND startsWith(s, 'abra') FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND (startsWith(s, 'c') OR startsWith(s, 'C'))"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND (startsWith(s, 'c') OR startsWith(s, 'C')) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="DROP TABLE bloom_filter_idx;"

View File

@ -0,0 +1,16 @@
9 abra
14 abracadabra
"rows_read": 4,
9 abra
10 cadabra
11 crabacadabra
14 abracadabra
15 cadabraabra
"rows_read": 6,
9 abra
14 abracadabra
"rows_read": 4,
1 ClickHouse is a column-oriented database management system (DBMS)
2 column-oriented database management system
13 basement
"rows_read": 6,

View File

@ -0,0 +1,53 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS set_idx;"
$CLICKHOUSE_CLIENT -n --query="
SET allow_experimental_data_skipping_indices = 1;
CREATE TABLE set_idx
(
k UInt64,
s String,
INDEX idx (s) TYPE set(2) GRANULARITY 1
) ENGINE = MergeTree()
ORDER BY k
SETTINGS index_granularity = 2;"
$CLICKHOUSE_CLIENT --query="INSERT INTO set_idx VALUES
(0, 'ClickHouse - столбцовая система управления базами данных (СУБД)'),
(1, 'ClickHouse is a column-oriented database management system (DBMS)'),
(2, 'column-oriented database management system'),
(3, 'columns'),
(4, 'какая-то строка'),
(5, 'еще строка'),
(6, 'some string'),
(7, 'another string'),
(8, 'computer science'),
(9, 'abra'),
(10, 'cadabra'),
(11, 'crabacadabra'),
(12, 'crab'),
(13, 'basement'),
(14, 'abracadabra'),
(15, 'cadabraabra')"
# STARTS_WITH
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra') FORMAT JSON" | grep "rows_read"
# ENDS_WITH
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE endsWith(s, 'abra')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE endsWith(s, 'abra') FORMAT JSON" | grep "rows_read"
# COMBINED
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra') FORMAT JSON" | grep "rows_read"
# MULTI_SEARCH_ANY
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data', 'base'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data', 'base']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="DROP TABLE set_idx;"

View File

@ -0,0 +1,22 @@
DROP TABLE IF EXISTS merge_tree;
CREATE TABLE merge_tree (x UInt8) ENGINE = MergeTree ORDER BY x;
INSERT INTO merge_tree SELECT 0 FROM numbers(1000000);
SET max_threads = 4;
SET max_rows_to_read = 1100000;
SET merge_tree_uniform_read_distribution = 1;
SELECT count() FROM merge_tree;
SET merge_tree_uniform_read_distribution = 0;
SELECT count() FROM merge_tree;
SET max_rows_to_read = 900000;
SET merge_tree_uniform_read_distribution = 1;
SELECT count() FROM merge_tree; -- { serverError 158 }
SET merge_tree_uniform_read_distribution = 0;
SELECT count() FROM merge_tree; -- { serverError 158 }
DROP TABLE merge_tree;

View File

@ -0,0 +1,9 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
set -e
# No log lines without query id
$CLICKHOUSE_CLIENT --send_logs_level=trace --query_id=hello --query="SELECT count() FROM numbers(10)" 2>&1 | grep -vF ' {hello} ' | grep -P '<\w+>' ||:

View File

@ -117,4 +117,10 @@
<path>/clickhouse/task_queue/ddl</path>
</distributed_ddl>
<format_schema_path>/tmp/clickhouse/data/format_schemas/</format_schema_path>
<query_masking_rules>
<rule>
<regexp>TOPSECRET.TOPSECRET</regexp>
<replace>[hidden]</replace>
</rule>
</query_masking_rules>
</yandex>

View File

@ -39,6 +39,7 @@ CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/; \

View File

@ -323,7 +323,7 @@ When using this format, ClickHouse outputs rows as separated, newline-delimited
```json
{"SearchPhrase":"curtain designs","count()":"1064"}
{"SearchPhrase":"baku","count()":"1000"}
{"SearchPhrase":"","count":"8267016"}
{"SearchPhrase":"","count()":"8267016"}
```
When inserting the data, you should provide a separate JSON object for each row.
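For example, an insert for a hypothetical table `phrases` with columns `SearchPhrase String` and `cnt UInt64` might look like this (a sketch, not one of the reference tables above):

```sql
INSERT INTO phrases FORMAT JSONEachRow {"SearchPhrase":"curtain designs","cnt":"1064"} {"SearchPhrase":"baku","cnt":"1000"}
```

The objects can follow each other on one line or be separated by newlines; no comma is needed between them.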
@ -386,6 +386,60 @@ Unlike the [JSON](#json) format, there is no substitution of invalid UTF-8 seque
!!! note "Note"
Any set of bytes can be output in the strings. Use the `JSONEachRow` format if you are sure that the data in the table can be formatted as JSON without losing any information.
### Usage of Nested Structures {#jsoneachrow-nested}
If you have a table with columns of the [Nested](../data_types/nested_data_structures/nested.md) data type, you can insert JSON data that has the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting.
For example, consider the following table:
```sql
CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory
```
As described for the `Nested` data type, ClickHouse treats each component of the nested structure as a separate column (`n.s` and `n.i` for our table), so you can insert the data in the following way:
```sql
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]}
```
To insert data as a hierarchical JSON object, set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json).
```json
{
"n": {
"s": ["abc", "def"],
"i": [1, 23]
}
}
```
Without this setting, ClickHouse throws an exception.
```sql
SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json'
```
```text
┌─name────────────────────────────┬─value─┐
│ input_format_import_nested_json │ 0 │
└─────────────────────────────────┴───────┘
```
```sql
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}}
```
```text
Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1)
```
```sql
SET input_format_import_nested_json=1
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}}
SELECT * FROM json_each_row_nested
```
```text
┌─n.s───────────┬─n.i────┐
│ ['abc','def'] │ [1,23] │
└───────────────┴────────┘
```
## Native {#native}
The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is "columnar": it doesn't convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients.
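For instance, a query result can be dumped in this format directly (a sketch; the output is binary and is normally redirected to a file or piped between clients):

```sql
SELECT number FROM system.numbers LIMIT 10 FORMAT Native
```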

View File

@ -231,6 +231,25 @@ Possible values:
Default value: 0.
## input_format_import_nested_json {#settings-input_format_import_nested_json}
Enables or disables the insertion of JSON data with nested objects.
Supported formats:
- [JSONEachRow](../../interfaces/formats.md#jsoneachrow)
Possible values:
- 0 — Disabled.
- 1 — Enabled.
Default value: 0.
**See Also**
- [Usage of Nested Structures](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format.
## input_format_with_names_use_header {#settings-input_format_with_names_use_header}
Enables or disables checking the column order when inserting data.
@ -249,6 +268,27 @@ Possible values:
Default value: 1.
## date_time_input_format {#settings-date_time_input_format}
Enables or disables extended parsing of strings that represent dates and times.
The setting doesn't apply to [date and time functions](../../query_language/functions/date_time_functions.md).
Possible values:
- `'best_effort'` — Enables extended parsing.
ClickHouse can parse the basic format `YYYY-MM-DD HH:MM:SS` and all the [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) date and time formats. For example, `'2018-06-08T01:02:03.000Z'`.
- `'basic'` — Use basic parser.
ClickHouse can parse only the basic format.
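To illustrate, a sketch assuming a hypothetical table `events` with a single `DateTime` column: with extended parsing enabled, an ISO 8601 value is accepted on insert where the basic parser would reject it.

```sql
SET date_time_input_format = 'best_effort';
INSERT INTO events FORMAT CSV "2018-06-08T01:02:03.000Z"
```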
**See Also**
- [DateTime data type.](../../data_types/datetime.md)
- [Functions for working with dates and times.](../../query_language/functions/date_time_functions.md)
## join_default_strictness {#settings-join_default_strictness}
Sets default strictness for [JOIN clauses](../../query_language/select.md#select-join).
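A minimal sketch (`t1` and `t2` are hypothetical tables; `ALL` is one of the accepted strictness values):

```sql
SET join_default_strictness = 'ALL';
SELECT * FROM t1 JOIN t2 USING (id); -- executed as an ALL JOIN
```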

View File

@ -58,11 +58,10 @@ arrayConcat(arrays)
- `arrays` Arbitrary number of arguments of [Array](../../data_types/array.md) type.
**Example**
``` sql
```sql
SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res
```
```
```text
┌─res───────────┐
│ [1,2,3,4,5,6] │
└───────────────┘
@ -204,7 +203,7 @@ Returns the array \[1, 2, 3, ..., length (arr) \]
This function is normally used with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example:
``` sql
```sql
SELECT
count() AS Reaches,
countIf(num = 1) AS Hits
@ -215,8 +214,7 @@ ARRAY JOIN
WHERE CounterID = 160656
LIMIT 10
```
```
```text
┌─Reaches─┬──Hits─┐
│ 95606 │ 31406 │
└─────────┴───────┘
@ -224,15 +222,14 @@ LIMIT 10
In this example, Reaches is the number of conversions (the strings received after applying ARRAY JOIN), and Hits is the number of pageviews (strings before ARRAY JOIN). In this particular case, you can get the same result in an easier way:
``` sql
```sql
SELECT
sum(length(GoalsReached)) AS Reaches,
count() AS Hits
FROM test.hits
WHERE (CounterID = 160656) AND notEmpty(GoalsReached)
```
```
```text
┌─Reaches─┬──Hits─┐
│ 95606 │ 31406 │
└─────────┴───────┘
@ -248,7 +245,7 @@ For example: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\].
This function is useful when using ARRAY JOIN and aggregation of array elements.
Example:
``` sql
```sql
SELECT
Goals.ID AS GoalID,
sum(Sign) AS Reaches,
@ -262,8 +259,7 @@ GROUP BY GoalID
ORDER BY Reaches DESC
LIMIT 10
```
```
```text
┌──GoalID─┬─Reaches─┬─Visits─┐
│ 53225 │ 3214 │ 1097 │
│ 2825062 │ 3188 │ 1097 │
@ -282,11 +278,10 @@ In this example, each goal ID has a calculation of the number of conversions (ea
The arrayEnumerateUniq function can take multiple arrays of the same size as arguments. In this case, uniqueness is considered for tuples of elements in the same positions in all the arrays.
``` sql
```sql
SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res
```
```
```text
┌─res───────────┐
│ [1,2,1,1,2,1] │
└───────────────┘
@ -308,11 +303,10 @@ arrayPopBack(array)
**Example**
``` sql
```sql
SELECT arrayPopBack([1, 2, 3]) AS res
```
```
```text
┌─res───┐
│ [1,2] │
└───────┘
@ -332,11 +326,10 @@ arrayPopFront(array)
**Example**
``` sql
```sql
SELECT arrayPopFront([1, 2, 3]) AS res
```
```
```text
┌─res───┐
│ [2,3] │
└───────┘
@ -357,11 +350,10 @@ arrayPushBack(array, single_value)
**Example**
``` sql
```sql
SELECT arrayPushBack(['a'], 'b') AS res
```
```
```text
┌─res───────┐
│ ['a','b'] │
└───────────┘
@ -382,11 +374,10 @@ arrayPushFront(array, single_value)
**Example**
``` sql
```sql
SELECT arrayPushFront(['b'], 'a') AS res
```
```
```text
┌─res───────┐
│ ['a','b'] │
└───────────┘
@ -446,11 +437,10 @@ arraySlice(array, offset[, length])
**Example**
``` sql
```sql
SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res
```
```
```text
┌─res────────┐
│ [2,NULL,4] │
└────────────┘
@ -464,10 +454,10 @@ Sorts the elements of the `arr` array in ascending order. If the `func` function
Example of integer values sorting:
``` sql
```sql
SELECT arraySort([1, 3, 3, 0]);
```
```
```text
┌─arraySort([1, 3, 3, 0])─┐
│ [0,1,3,3] │
└─────────────────────────┘
@ -475,10 +465,10 @@ SELECT arraySort([1, 3, 3, 0]);
Example of string values sorting:
``` sql
```sql
SELECT arraySort(['hello', 'world', '!']);
```
```
```text
┌─arraySort(['hello', 'world', '!'])─┐
│ ['!','hello','world'] │
└────────────────────────────────────┘
@ -486,10 +476,10 @@ SELECT arraySort(['hello', 'world', '!']);
Consider the following sorting order for the `NULL`, `NaN` and `Inf` values:
``` sql
```sql
SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]);
```
```
```text
┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐
│ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │
└───────────────────────────────────────────────────────────┘
@ -504,10 +494,10 @@ Note that `arraySort` is a [higher-order function](higher_order_functions.md). Y
Let's consider the following example:
``` sql
```sql
SELECT arraySort((x) -> -x, [1, 2, 3]) as res;
```
```
```text
┌─res─────┐
│ [3,2,1] │
└─────────┘
@ -517,11 +507,10 @@ For each element of the source array, the lambda function returns the sorting ke
The lambda function can accept multiple arguments. In this case, you need to pass the `arraySort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example:
``` sql
```sql
SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res;
```
```
```text
┌─res────────────────┐
│ ['world', 'hello'] │
└────────────────────┘
@ -531,19 +520,19 @@ Here, the elements that are passed in the second array ([2, 1]) define a sorting
Other examples are shown below.
``` sql
```sql
SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res;
```
``` sql
```text
┌─res─────┐
│ [2,1,0] │
└─────────┘
```
``` sql
```sql
SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res;
```
``` sql
```text
┌─res─────┐
│ [2,1,0] │
└─────────┘
@ -558,10 +547,10 @@ Sorts the elements of the `arr` array in descending order. If the `func` functio
Example of integer values sorting:
``` sql
```sql
SELECT arrayReverseSort([1, 3, 3, 0]);
```
```
```text
┌─arrayReverseSort([1, 3, 3, 0])─┐
│ [3,3,1,0] │
└────────────────────────────────┘
@ -569,10 +558,10 @@ SELECT arrayReverseSort([1, 3, 3, 0]);
Example of string values sorting:
``` sql
```sql
SELECT arrayReverseSort(['hello', 'world', '!']);
```
```
```text
┌─arrayReverseSort(['hello', 'world', '!'])─┐
│ ['world','hello','!'] │
└───────────────────────────────────────────┘
@ -580,10 +569,10 @@ SELECT arrayReverseSort(['hello', 'world', '!']);
Consider the following sorting order for the `NULL`, `NaN` and `Inf` values:
``` sql
```sql
SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res;
```
``` sql
```text
┌─res───────────────────────────────────┐
│ [inf,3,2,1,-4,-inf,nan,nan,NULL,NULL] │
└───────────────────────────────────────┘
@ -596,10 +585,10 @@ SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res;
Note that the `arrayReverseSort` is a [higher-order function](higher_order_functions.md). You can pass a lambda function to it as the first argument. Example is shown below.
``` sql
```sql
SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res;
```
```
```text
┌─res─────┐
│ [1,2,3] │
└─────────┘
@ -612,10 +601,10 @@ The array is sorted in the following way:
The lambda function can accept multiple arguments. In this case, you need to pass the `arrayReverseSort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example:
``` sql
```sql
SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res;
```
``` sql
```text
┌─res───────────────┐
│ ['hello','world'] │
└───────────────────┘
@ -628,18 +617,18 @@ In this example, the array is sorted in the following way:
Other examples are shown below.
``` sql
```sql
SELECT arrayReverseSort((x, y) -> y, [4, 3, 5], ['a', 'b', 'c']) AS res;
```
``` sql
```text
┌─res─────┐
│ [5,3,4] │
└─────────┘
```
``` sql
```sql
SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res;
```
``` sql
```text
┌─res─────┐
│ [4,3,5] │
└─────────┘

View File

@ -212,7 +212,7 @@ SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:
## jumpConsistentHash
Calculates JumpConsistentHash from a UInt64.
Accepts a UInt64-type argument. Returns Int32.
Accepts two arguments: a UInt64-type key and the number of buckets. Returns Int32.
For more information, see the link: [JumpConsistentHash](https://arxiv.org/pdf/1406.2294.pdf)
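A minimal query sketch (the key and the bucket count of 8 are arbitrary; the result is a bucket index in the range `[0, 8)`):

```sql
SELECT jumpConsistentHash(toUInt64(42), 8) AS bucket
```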
## murmurHash2_32, murmurHash2_64

View File

@ -328,6 +328,60 @@ JSON is compatible with JavaScript. To ensure
For parsing, any order of the values of different columns is supported. Omitting some values is acceptable; they are treated as equal to their default values. In this case, zeros and empty rows are used as default values. Complex values that could be specified in the table are not supported as defaults. Whitespace between elements is ignored. A comma placed after an object is ignored. Objects do not need to be separated by new lines.
### Usage of Nested Structures {#jsoneachrow-nested}
If you have a table with columns of the [Nested](../data_types/nested_data_structures/nested.md) data type, you can insert JSON data that has the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting.
For example, consider the following table:
```sql
CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory
```
As described for the `Nested` data type, ClickHouse treats each component of the nested structure as a separate column (`n.s` and `n.i` for our table), so you can insert the data in the following way:
```sql
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]}
```
To insert data as a hierarchical JSON object, set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json).
```json
{
"n": {
"s": ["abc", "def"],
"i": [1, 23]
}
}
```
Without this setting, ClickHouse throws an exception.
```sql
SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json'
```
```text
┌─name────────────────────────────┬─value─┐
│ input_format_import_nested_json │ 0 │
└─────────────────────────────────┴───────┘
```
```sql
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}}
```
```text
Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1)
```
```sql
SET input_format_import_nested_json=1
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}}
SELECT * FROM json_each_row_nested
```
```text
┌─n.s───────────┬─n.i────┐
│ ['abc','def'] │ [1,23] │
└───────────────┴────────┘
```
## Native
The most efficient format. Data is written and read in blocks in binary format. For each block, the number of rows, the number of columns, the column names and their types, and the parts of the columns in this block are recorded one after another. In other words, this format is "columnar": it does not convert columns to rows. This format is used in the native interface, between the server and the command-line client, and by the C++ client.

View File

@ -327,6 +327,60 @@ ClickHouse supports [NULL](../query_language/syntax.md), represented in the JSON format as `nu
For parsing, the values of different columns are supported in any order. Some values may be omitted; they are treated as equal to their default values. In this case, zeros and blank rows are used as default values. Complex values that could be specified in the table are not supported as defaults. Whitespace characters between elements are ignored. A comma placed after an object is ignored. Objects do not have to be separated by new lines.
### Usage of Nested Structures {#jsoneachrow-nested}
If you have a table with columns of the [Nested](../data_types/nested_data_structures/nested.md) data type, you can insert JSON data that has the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting.
For example, consider the following table:
```sql
CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory
```
As described for the `Nested` data type, ClickHouse treats each component of the nested structure as a separate column (`n.s` and `n.i` for our table), so you can insert the data in the following way:
```sql
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]}
```
To insert data as a hierarchical JSON object, set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json).
```json
{
"n": {
"s": ["abc", "def"],
"i": [1, 23]
}
}
```
Without this setting, ClickHouse throws an exception.
```sql
SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json'
```
```text
┌─name────────────────────────────┬─value─┐
│ input_format_import_nested_json │ 0 │
└─────────────────────────────────┴───────┘
```
```sql
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}}
```
```text
Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1)
```
```sql
SET input_format_import_nested_json=1
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}}
SELECT * FROM json_each_row_nested
```
```text
┌─n.s───────────┬─n.i────┐
│ ['abc','def'] │ [1,23] │
└───────────────┴────────┘
```
## Native {#native}
The most efficient format. Data is written and read in blocks in binary format. For each block, the number of rows, the number of columns, the column names and types, and the parts of the columns in the block are recorded one after another. In other words, this format is "columnar": it does not convert columns to rows. This is the format used in the native interface for interaction between servers, for the command-line client, and for the C++ client.