From c9a5b3c9ab5f1617bf3913ffc0378d73c405b1b6 Mon Sep 17 00:00:00 2001 From: dimarub2000 Date: Wed, 10 Jul 2019 17:53:57 +0300 Subject: [PATCH 01/26] primary key and MergeTreeIndexFullText support for string functions --- dbms/src/Storages/MergeTree/KeyCondition.cpp | 192 +++++++++++------- .../MergeTree/MergeTreeIndexFullText.cpp | 154 ++++++++++---- .../MergeTree/MergeTreeIndexFullText.h | 4 +- 3 files changed, 240 insertions(+), 110 deletions(-) diff --git a/dbms/src/Storages/MergeTree/KeyCondition.cpp b/dbms/src/Storages/MergeTree/KeyCondition.cpp index 22cf7cbadad..7993ac4555a 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.cpp +++ b/dbms/src/Storages/MergeTree/KeyCondition.cpp @@ -178,6 +178,24 @@ const KeyCondition::AtomMap KeyCondition::atom_map return true; } }, + { + "empty", + [] (RPNElement & out, const Field &) + { + out.function = RPNElement::FUNCTION_IN_RANGE; + out.range = Range(""); + return true; + } + }, + { + "notEmpty", + [] (RPNElement & out, const Field &) + { + out.function = RPNElement::FUNCTION_NOT_IN_RANGE; + out.range = Range(""); + return true; + } + }, { "like", [] (RPNElement & out, const Field & value) @@ -199,6 +217,27 @@ const KeyCondition::AtomMap KeyCondition::atom_map return true; } }, + { + "notLike", + [] (RPNElement & out, const Field & value) + { + if (value.getType() != Field::Types::String) + return false; + + String prefix = extractFixedPrefixFromLikePattern(value.get()); + if (prefix.empty()) + return false; + + String right_bound = firstStringThatIsGreaterThanAllStringsWithPrefix(prefix); + + out.function = RPNElement::FUNCTION_NOT_IN_RANGE; + out.range = !right_bound.empty() + ? Range(prefix, true, right_bound, false) + : Range::createLeftBounded(prefix, true); + + return true; + } + }, { "startsWith", [] (RPNElement & out, const Field & value) @@ -645,92 +684,102 @@ bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Blo { const ASTs & args = func->arguments->children; - if (args.size() != 2) - return false; - DataTypePtr key_expr_type; /// Type of expression containing key column - size_t key_arg_pos; /// Position of argument with key column (non-const argument) size_t key_column_num = -1; /// Number of a key column (inside key_column_names array) MonotonicFunctionsChain chain; - bool is_set_const = false; - bool is_constant_transformed = false; + std::string func_name = func->name; - if (functionIsInOrGlobalInOperator(func->name) - && tryPrepareSetIndex(args, context, out, key_column_num)) + if (args.size() == 1) { - key_arg_pos = 0; - is_set_const = true; + if (!(isKeyPossiblyWrappedByMonotonicFunctions(args[0], context, key_column_num, key_expr_type, chain))) + return false; + + if (key_column_num == static_cast(-1)) + throw Exception("`key_column_num` wasn't initialized. 
It is a bug.", ErrorCodes::LOGICAL_ERROR); } - else if (getConstant(args[1], block_with_constants, const_value, const_type) - && isKeyPossiblyWrappedByMonotonicFunctions(args[0], context, key_column_num, key_expr_type, chain)) + else if (args.size() == 2) { - key_arg_pos = 0; - } - else if (getConstant(args[1], block_with_constants, const_value, const_type) - && canConstantBeWrappedByMonotonicFunctions(args[0], key_column_num, key_expr_type, const_value, const_type)) - { - key_arg_pos = 0; - is_constant_transformed = true; - } - else if (getConstant(args[0], block_with_constants, const_value, const_type) - && isKeyPossiblyWrappedByMonotonicFunctions(args[1], context, key_column_num, key_expr_type, chain)) - { - key_arg_pos = 1; - } - else if (getConstant(args[0], block_with_constants, const_value, const_type) - && canConstantBeWrappedByMonotonicFunctions(args[1], key_column_num, key_expr_type, const_value, const_type)) - { - key_arg_pos = 1; - is_constant_transformed = true; + size_t key_arg_pos; /// Position of argument with key column (non-const argument) + bool is_set_const = false; + bool is_constant_transformed = false; + + if (functionIsInOrGlobalInOperator(func_name) + && tryPrepareSetIndex(args, context, out, key_column_num)) + { + key_arg_pos = 0; + is_set_const = true; + } + else if (getConstant(args[1], block_with_constants, const_value, const_type) + && isKeyPossiblyWrappedByMonotonicFunctions(args[0], context, key_column_num, key_expr_type, chain)) + { + key_arg_pos = 0; + } + else if (getConstant(args[1], block_with_constants, const_value, const_type) + && canConstantBeWrappedByMonotonicFunctions(args[0], key_column_num, key_expr_type, const_value, const_type)) + { + key_arg_pos = 0; + is_constant_transformed = true; + } + else if (getConstant(args[0], block_with_constants, const_value, const_type) + && isKeyPossiblyWrappedByMonotonicFunctions(args[1], context, key_column_num, key_expr_type, chain)) + { + key_arg_pos = 1; + } + else if (getConstant(args[0], block_with_constants, const_value, const_type) + && canConstantBeWrappedByMonotonicFunctions(args[1], key_column_num, key_expr_type, const_value, const_type)) + { + key_arg_pos = 1; + is_constant_transformed = true; + } + else + return false; + + if (key_column_num == static_cast(-1)) + throw Exception("`key_column_num` wasn't initialized. It is a bug.", ErrorCodes::LOGICAL_ERROR); + + /// Transformed constant must weaken the condition, for example "x > 5" must weaken to "round(x) >= 5" + if (is_constant_transformed) + { + if (func_name == "less") + func_name = "lessOrEquals"; + else if (func_name == "greater") + func_name = "greaterOrEquals"; + } + + /// Replace on to <-sign> + if (key_arg_pos == 1) + { + if (func_name == "less") + func_name = "greater"; + else if (func_name == "greater") + func_name = "less"; + else if (func_name == "greaterOrEquals") + func_name = "lessOrEquals"; + else if (func_name == "lessOrEquals") + func_name = "greaterOrEquals"; + else if (func_name == "in" || func_name == "notIn" || func_name == "like") + { + /// "const IN data_column" doesn't make sense (unlike "data_column IN const") + return false; + } + } + + bool cast_not_needed = + is_set_const /// Set args are already casted inside Set::createFromAST + || (isNativeNumber(key_expr_type) && isNativeNumber(const_type)); /// Numbers are accurately compared without cast. 
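        /// For example (restating the rules above): with the constant on the left,
        /// "5 > x" is evaluated as "x < 5"; and when the constant had to be
        /// transformed, "x > 5" is weakened to "round(x) >= 5" so that no matching
        /// granules can be lost.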
+ + if (!cast_not_needed) + castValueToType(key_expr_type, const_value, const_type, node); } else return false; - if (key_column_num == static_cast(-1)) - throw Exception("`key_column_num` wasn't initialized. It is a bug.", ErrorCodes::LOGICAL_ERROR); - - std::string func_name = func->name; - - /// Transformed constant must weaken the condition, for example "x > 5" must weaken to "round(x) >= 5" - if (is_constant_transformed) - { - if (func_name == "less") - func_name = "lessOrEquals"; - else if (func_name == "greater") - func_name = "greaterOrEquals"; - } - - /// Replace on to <-sign> - if (key_arg_pos == 1) - { - if (func_name == "less") - func_name = "greater"; - else if (func_name == "greater") - func_name = "less"; - else if (func_name == "greaterOrEquals") - func_name = "lessOrEquals"; - else if (func_name == "lessOrEquals") - func_name = "greaterOrEquals"; - else if (func_name == "in" || func_name == "notIn" || func_name == "like") - { - /// "const IN data_column" doesn't make sense (unlike "data_column IN const") - return false; - } - } - - out.key_column = key_column_num; - out.monotonic_functions_chain = std::move(chain); - const auto atom_it = atom_map.find(func_name); if (atom_it == std::end(atom_map)) return false; - bool cast_not_needed = - is_set_const /// Set args are already casted inside Set::createFromAST - || (isNativeNumber(key_expr_type) && isNativeNumber(const_type)); /// Numbers are accurately compared without cast. - - if (!cast_not_needed) - castValueToType(key_expr_type, const_value, const_type, node); + out.key_column = key_column_num; + out.monotonic_functions_chain = std::move(chain); return atom_it->second(out, const_value); } @@ -748,7 +797,6 @@ bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Blo return true; } } - return false; } diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 895764339e5..8a5b942effd 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -142,7 +142,8 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map "like", [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) { - out.function = RPNElement::FUNCTION_LIKE; + std::cerr << "FULLTEXT INDEX IS USED FOR LIKE FUNCTION" << '\n'; + out.function = RPNElement::FUNCTION_EQUALS; out.bloom_filter = std::make_unique( idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); @@ -151,6 +152,92 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map return true; } }, + { + "notLike", + [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) + { + std::cerr << "FULLTEXT INDEX IS USED FOR NOT_LIKE FUNCTION" << '\n'; + out.function = RPNElement::FUNCTION_NOT_EQUALS; + out.bloom_filter = std::make_unique( + idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); + + const auto & str = value.get(); + likeStringToBloomFilter(str, idx.token_extractor_func, *out.bloom_filter); + return true; + } + }, + { + "startsWith", + [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) + { + out.function = RPNElement::FUNCTION_EQUALS; + out.bloom_filter = std::make_unique( + idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); + + const auto & prefix = value.get(); + stringToBloomFilter(prefix.c_str(), prefix.size(), idx.token_extractor_func, *out.bloom_filter); + return true; + } + }, + { + 
"endsWith", + [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) + { + out.function = RPNElement::FUNCTION_EQUALS; + out.bloom_filter = std::make_unique( + idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); + + const auto & suffix = value.get(); + stringToBloomFilter(suffix.c_str(), suffix.size(), idx.token_extractor_func, *out.bloom_filter); + return true; + } + }, + { + "multiSearchAny", + [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) + { + out.function = RPNElement::FUNCTION_EQUALS; + out.bloom_filter = std::make_unique( + idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); + for (const auto & element : value.get()) + { + if (element.getType() != Field::Types::String) + return false; + + const auto & str = element.get(); + stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter); + } + return true; + } + }, + { + "empty", + [] (RPNElement & out, const Field &, const MergeTreeIndexFullText & idx) + { + out.function = RPNElement::FUNCTION_EQUALS; + out.bloom_filter = std::make_unique( + idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); + + std::string empty_str; + stringToBloomFilter(empty_str.c_str(), empty_str.size(), idx.token_extractor_func, *out.bloom_filter); + return true; + } + + }, + { + "notEmpty", + [] (RPNElement & out, const Field &, const MergeTreeIndexFullText & idx) + { + out.function = RPNElement::FUNCTION_NOT_EQUALS; + out.bloom_filter = std::make_unique( + idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); + + std::string empty_str; + stringToBloomFilter(empty_str.c_str(), empty_str.size(), idx.token_extractor_func, *out.bloom_filter); + return true; + } + + }, { "notIn", [] (RPNElement & out, const Field &, const MergeTreeIndexFullText &) @@ -197,8 +284,6 @@ bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const } else if (element.function == RPNElement::FUNCTION_EQUALS || element.function == RPNElement::FUNCTION_NOT_EQUALS - || element.function == RPNElement::FUNCTION_LIKE - || element.function == RPNElement::FUNCTION_NOT_LIKE || element.function == RPNElement::FUNCTION_IN || element.function == RPNElement::FUNCTION_NOT_IN || element.function == RPNElement::ALWAYS_FALSE) @@ -255,15 +340,6 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx if (element.function == RPNElement::FUNCTION_NOT_EQUALS) rpn_stack.back() = !rpn_stack.back(); } - else if (element.function == RPNElement::FUNCTION_LIKE - || element.function == RPNElement::FUNCTION_NOT_LIKE) - { - rpn_stack.emplace_back( - granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true); - - if (element.function == RPNElement::FUNCTION_NOT_LIKE) - rpn_stack.back() = !rpn_stack.back(); - } else if (element.function == RPNElement::FUNCTION_IN || element.function == RPNElement::FUNCTION_NOT_IN) { @@ -338,42 +414,50 @@ bool MergeTreeConditionFullText::atomFromAST( { const ASTs & args = typeid_cast(*func->arguments).children; - if (args.size() != 2) - return false; - - size_t key_arg_pos; /// Position of argument with key column (non-const argument) size_t key_column_num = -1; /// Number of a key column (inside key_column_names array) + std::string func_name = func->name; - if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out)) + if (args.size() == 1) { - key_arg_pos = 0; + if (!getKey(args[0], key_column_num)) + return false; } - else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, 
const_type) && getKey(args[0], key_column_num)) + else if (args.size() == 2) { - key_arg_pos = 0; - } - else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && getKey(args[1], key_column_num)) - { - key_arg_pos = 1; + + size_t key_arg_pos; /// Position of argument with key column (non-const argument) + + if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out)) { + key_arg_pos = 0; + } else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, const_type) && + getKey(args[0], key_column_num)) { + key_arg_pos = 0; + } else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && + getKey(args[1], key_column_num)) { + key_arg_pos = 1; + } else + return false; + + if (const_type && const_type->getTypeId() != TypeIndex::String && + const_type->getTypeId() != TypeIndex::FixedString && + const_type->getTypeId() != TypeIndex::Array) + return false; + + if (key_arg_pos == 1 && (func_name != "equals" || func_name != "notEquals")) + return false; + + else if (!index.token_extractor_func->supportLike() && (func_name == "like" || func_name == "notLike")) + return false; } else return false; - if (const_type && const_type->getTypeId() != TypeIndex::String && const_type->getTypeId() != TypeIndex::FixedString) - return false; - - if (key_arg_pos == 1 && (func->name != "equals" || func->name != "notEquals")) - return false; - else if (!index.token_extractor_func->supportLike() && (func->name == "like" || func->name == "notLike")) - return false; - else - key_arg_pos = 0; - - const auto atom_it = atom_map.find(func->name); + const auto atom_it = atom_map.find(func_name); if (atom_it == std::end(atom_map)) return false; out.key_column = key_column_num; + return atom_it->second(out, const_value, index); } else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type)) diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h index cd8ac534e64..7cb5bf9e71b 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h @@ -78,8 +78,6 @@ private: /// Atoms of a Boolean expression. FUNCTION_EQUALS, FUNCTION_NOT_EQUALS, - FUNCTION_LIKE, - FUNCTION_NOT_LIKE, FUNCTION_IN, FUNCTION_NOT_IN, FUNCTION_UNKNOWN, /// Can take any value. @@ -97,7 +95,7 @@ private: : function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {} Function function = FUNCTION_UNKNOWN; - /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS, FUNCTION_LIKE, FUNCTION_NOT_LIKE. 
+    /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
     size_t key_column;
     std::unique_ptr<StringBloomFilter> bloom_filter;
     /// For FUNCTION_IN and FUNCTION_NOT_IN

From 88778b0f5b1c6a1bb108fd96de4d0c19a1f0ab7d Mon Sep 17 00:00:00 2001
From: dimarub2000
Date: Wed, 10 Jul 2019 18:12:02 +0300
Subject: [PATCH 02/26] minor style changes

---
 dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
index 8a5b942effd..3f7ce1edd70 100644
--- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
@@ -142,7 +142,6 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
         "like",
         [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
         {
-            std::cerr << "FULLTEXT INDEX IS USED FOR LIKE FUNCTION" << '\n';
             out.function = RPNElement::FUNCTION_EQUALS;
             out.bloom_filter = std::make_unique<StringBloomFilter>(
                 idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
@@ -156,7 +155,6 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
         "notLike",
         [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
         {
-            std::cerr << "FULLTEXT INDEX IS USED FOR NOT_LIKE FUNCTION" << '\n';
             out.function = RPNElement::FUNCTION_NOT_EQUALS;
             out.bloom_filter = std::make_unique<StringBloomFilter>(
                 idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);

From 1480b8e62cb472840209904355bac5ee8a920686 Mon Sep 17 00:00:00 2001
From: dimarub2000
Date: Thu, 11 Jul 2019 15:31:24 +0300
Subject: [PATCH 03/26] Actually, set index already works for everything. Here
 are some tests.

---
 .../Storages/MergeTree/MergeTreeIndexSet.cpp  |  5 +-
 ...00965_set_index_string_functions.reference | 16 +++++
 .../00965_set_index_string_functions.sh       | 66 +++++++++++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 dbms/tests/queries/0_stateless/00965_set_index_string_functions.reference
 create mode 100755 dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh

diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp
index 9527e6a0a67..fc3b905d1eb 100644
--- a/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp
@@ -411,7 +411,10 @@ static bool checkAtomName(const String & name)
         "greaterOrEquals",
         "in",
         "notIn",
-        "like"
+        "like",
+        "startsWith",
+        "endsWith",
+        "multiSearchAny"
     };
     return atoms.find(name) != atoms.end();
 }
diff --git a/dbms/tests/queries/0_stateless/00965_set_index_string_functions.reference b/dbms/tests/queries/0_stateless/00965_set_index_string_functions.reference
new file mode 100644
index 00000000000..a9c9f97bdec
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00965_set_index_string_functions.reference
@@ -0,0 +1,16 @@
+9 abra
+14 abracadabra
+  "rows_read": 4,
+9 abra
+10 cadabra
+11 crabacadabra
+14 abracadabra
+15 cadabraabra
+  "rows_read": 6,
+9 abra
+14 abracadabra
+  "rows_read": 4,
+1 ClickHouse is a column-oriented database management system (DBMS)
+2 column-oriented database management system
+13 basement
+  "rows_read": 6,
diff --git a/dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh b/dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh
new file mode 100755
index 00000000000..9c6217a9d73
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS set_idx;" + +$CLICKHOUSE_CLIENT -n --query=" +SET allow_experimental_data_skipping_indices = 1; +CREATE TABLE set_idx +( + k UInt64, + s String, + INDEX idx (s) TYPE set(2) GRANULARITY 1 +) ENGINE = MergeTree() +ORDER BY k +SETTINGS index_granularity = 2;" + +$CLICKHOUSE_CLIENT --query="INSERT INTO set_idx VALUES +(0, 'ClickHouse - столбцовая система управления базами данных (СУБД)'), +(1, 'ClickHouse is a column-oriented database management system (DBMS)'), +(2, 'column-oriented database management system'), +(3, 'columns'), +(4, 'какая-то строка'), +(5, 'еще строка'), +(6, 'some string'), +(7, 'another string'), +(8, 'computer science'), +(9, 'abra'), +(10, 'cadabra'), +(11, 'crabacadabra'), +(12, 'crab'), +(13, 'basement'), +(14, 'abracadabra'), +(15, 'cadabraabra')" + +# STARTS_WITH +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra')" +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra') FORMAT JSON" | grep "rows_read" + +# ENDS_WITH +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE endsWith(s, 'abra')" +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE endsWith(s, 'abra') FORMAT JSON" | grep "rows_read" + +# COMBINED +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra')" +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra') FORMAT JSON" | grep "rows_read" + +# MULTY_SEARCH_ANY +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data', 'base'])" +$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data', 'base']) FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="DROP TABLE set_idx;" + + + + + + + + + + + + + From e5d4f11c737db2792dda5b5e09a6844486b5f2c6 Mon Sep 17 00:00:00 2001 From: dimarub2000 Date: Thu, 11 Jul 2019 16:44:44 +0300 Subject: [PATCH 04/26] Deleted some extra implementations, fixed style. multiSearchAny still not working. 
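A note on why multiSearchAny still fails here: all needle tokens currently go
into one shared bloom filter, and a granule "matches" only if it may contain
the tokens of every needle at once -- AND semantics, where multiSearchAny needs
OR. The follow-up commit keeps one filter per needle instead. A minimal sketch
of the intended check (hypothetical helper name; StringBloomFilter::contains()
is the call used elsewhere in this series):

    bool granuleMayContainAnyNeedle(
        const StringBloomFilter & granule_filter,
        const std::vector<StringBloomFilter> & needle_filters)
    {
        for (const auto & needle : needle_filters)
            if (granule_filter.contains(needle)) // all tokens of this one needle may be present
                return true;                     // one possible match is enough for "any"
        return false;
    }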
--- .../MergeTree/MergeTreeIndexFullText.cpp | 88 ++++++------------- 1 file changed, 26 insertions(+), 62 deletions(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 3f7ce1edd70..6a4af7a56c6 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -208,34 +208,6 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map return true; } }, - { - "empty", - [] (RPNElement & out, const Field &, const MergeTreeIndexFullText & idx) - { - out.function = RPNElement::FUNCTION_EQUALS; - out.bloom_filter = std::make_unique( - idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); - - std::string empty_str; - stringToBloomFilter(empty_str.c_str(), empty_str.size(), idx.token_extractor_func, *out.bloom_filter); - return true; - } - - }, - { - "notEmpty", - [] (RPNElement & out, const Field &, const MergeTreeIndexFullText & idx) - { - out.function = RPNElement::FUNCTION_NOT_EQUALS; - out.bloom_filter = std::make_unique( - idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); - - std::string empty_str; - stringToBloomFilter(empty_str.c_str(), empty_str.size(), idx.token_extractor_func, *out.bloom_filter); - return true; - } - - }, { "notIn", [] (RPNElement & out, const Field &, const MergeTreeIndexFullText &) @@ -404,7 +376,7 @@ bool MergeTreeConditionFullText::getKey(const ASTPtr & node, size_t & key_column } bool MergeTreeConditionFullText::atomFromAST( - const ASTPtr & node, Block & block_with_constants, RPNElement & out) + const ASTPtr & node, Block & block_with_constants, RPNElement & out) { Field const_value; DataTypePtr const_type; @@ -412,58 +384,50 @@ bool MergeTreeConditionFullText::atomFromAST( { const ASTs & args = typeid_cast(*func->arguments).children; + if (args.size() != 2) + return false; + + size_t key_arg_pos; /// Position of argument with key column (non-const argument) size_t key_column_num = -1; /// Number of a key column (inside key_column_names array) - std::string func_name = func->name; - if (args.size() == 1) + if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out)) { - if (!getKey(args[0], key_column_num)) - return false; + key_arg_pos = 0; } - else if (args.size() == 2) + else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, const_type) && getKey(args[0], key_column_num)) { - - size_t key_arg_pos; /// Position of argument with key column (non-const argument) - - if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out)) { - key_arg_pos = 0; - } else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, const_type) && - getKey(args[0], key_column_num)) { - key_arg_pos = 0; - } else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && - getKey(args[1], key_column_num)) { - key_arg_pos = 1; - } else - return false; - - if (const_type && const_type->getTypeId() != TypeIndex::String && - const_type->getTypeId() != TypeIndex::FixedString && - const_type->getTypeId() != TypeIndex::Array) - return false; - - if (key_arg_pos == 1 && (func_name != "equals" || func_name != "notEquals")) - return false; - - else if (!index.token_extractor_func->supportLike() && (func_name == "like" || func_name == "notLike")) - return false; + key_arg_pos = 0; + } + else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && getKey(args[1], 
key_column_num)) + { + key_arg_pos = 1; } else return false; - const auto atom_it = atom_map.find(func_name); + if (const_type && const_type->getTypeId() != TypeIndex::String + && const_type->getTypeId() != TypeIndex::FixedString + && const_type->getTypeId() != TypeIndex::Array) + return false; + + if (key_arg_pos == 1 && (func->name != "equals" || func->name != "notEquals")) + return false; + else if (!index.token_extractor_func->supportLike() && (func->name == "like" || func->name == "notLike")) + return false; + + const auto atom_it = atom_map.find(func->name); if (atom_it == std::end(atom_map)) return false; out.key_column = key_column_num; - return atom_it->second(out, const_value, index); } else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type)) { /// Check constant like in KeyCondition if (const_value.getType() == Field::Types::UInt64 - || const_value.getType() == Field::Types::Int64 - || const_value.getType() == Field::Types::Float64) + || const_value.getType() == Field::Types::Int64 + || const_value.getType() == Field::Types::Float64) { /// Zero in all types is represented in memory the same way as in UInt64. out.function = const_value.get() From 4005987fc4219d1fab874a0904aa188041a67a95 Mon Sep 17 00:00:00 2001 From: dimarub2000 Date: Fri, 12 Jul 2019 14:35:17 +0300 Subject: [PATCH 05/26] multiSearchAny fixed, tests added, minor syle changes --- .../MergeTree/MergeTreeIndexFullText.cpp | 38 +++++--- .../MergeTree/MergeTreeIndexFullText.h | 14 ++- ...964_bloom_index_string_functions.reference | 53 ++++++++++++ .../00964_bloom_index_string_functions.sh | 86 +++++++++++++++++++ .../00965_set_index_string_functions.sh | 13 --- 5 files changed, 176 insertions(+), 28 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.reference create mode 100755 dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.sh diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 6a4af7a56c6..43a507ae902 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -194,17 +194,20 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map "multiSearchAny", [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx) { - out.function = RPNElement::FUNCTION_EQUALS; - out.bloom_filter = std::make_unique( - idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); + out.function = RPNElement::FUNCTION_MULTI_SEARCH; + + std::vector> bloom_filters; + bloom_filters.emplace_back(); for (const auto & element : value.get()) { if (element.getType() != Field::Types::String) return false; + bloom_filters.back().emplace_back(idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed); const auto & str = element.get(); - stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter); + stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, bloom_filters.back().back()); } + out.set_bloom_filters = std::move(bloom_filters); return true; } }, @@ -256,6 +259,7 @@ bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const || element.function == RPNElement::FUNCTION_NOT_EQUALS || element.function == RPNElement::FUNCTION_IN || element.function == RPNElement::FUNCTION_NOT_IN + || element.function == RPNElement::FUNCTION_MULTI_SEARCH || element.function == RPNElement::ALWAYS_FALSE) { rpn_stack.push_back(false); @@ 
-311,7 +315,7 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx rpn_stack.back() = !rpn_stack.back(); } else if (element.function == RPNElement::FUNCTION_IN - || element.function == RPNElement::FUNCTION_NOT_IN) + || element.function == RPNElement::FUNCTION_NOT_IN) { std::vector result(element.set_bloom_filters.back().size(), true); @@ -329,6 +333,18 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx if (element.function == RPNElement::FUNCTION_NOT_IN) rpn_stack.back() = !rpn_stack.back(); } + else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH) + { + std::vector result(element.set_bloom_filters.back().size(), true); + + const auto & bloom_filters = element.set_bloom_filters[0]; + + for (size_t row = 0; row < bloom_filters.size(); ++row) + result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]); + + rpn_stack.emplace_back( + std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true); + } else if (element.function == RPNElement::FUNCTION_NOT) { rpn_stack.back() = !rpn_stack.back(); @@ -376,7 +392,7 @@ bool MergeTreeConditionFullText::getKey(const ASTPtr & node, size_t & key_column } bool MergeTreeConditionFullText::atomFromAST( - const ASTPtr & node, Block & block_with_constants, RPNElement & out) + const ASTPtr & node, Block & block_with_constants, RPNElement & out) { Field const_value; DataTypePtr const_type; @@ -389,8 +405,9 @@ bool MergeTreeConditionFullText::atomFromAST( size_t key_arg_pos; /// Position of argument with key column (non-const argument) size_t key_column_num = -1; /// Number of a key column (inside key_column_names array) + std::string func_name = func->name; - if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out)) + if (functionIsInOrGlobalInOperator(func_name) && tryPrepareSetBloomFilter(args, out)) { key_arg_pos = 0; } @@ -410,12 +427,12 @@ bool MergeTreeConditionFullText::atomFromAST( && const_type->getTypeId() != TypeIndex::Array) return false; - if (key_arg_pos == 1 && (func->name != "equals" || func->name != "notEquals")) + if (key_arg_pos == 1 && (func_name != "equals" || func_name != "notEquals")) return false; - else if (!index.token_extractor_func->supportLike() && (func->name == "like" || func->name == "notLike")) + else if (!index.token_extractor_func->supportLike() && (func_name == "like" || func_name == "notLike")) return false; - const auto atom_it = atom_map.find(func->name); + const auto atom_it = atom_map.find(func_name); if (atom_it == std::end(atom_map)) return false; @@ -521,7 +538,6 @@ bool MergeTreeConditionFullText::tryPrepareSetBloomFilter( return true; } - MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const { return std::make_shared(*this); diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h index 7cb5bf9e71b..af22b83272f 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h @@ -80,6 +80,7 @@ private: FUNCTION_NOT_EQUALS, FUNCTION_IN, FUNCTION_NOT_IN, + FUNCTION_MULTI_SEARCH, FUNCTION_UNKNOWN, /// Can take any value. /// Operators of the logical expression. 
FUNCTION_NOT, @@ -91,15 +92,20 @@ private: }; RPNElement( - Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr && const_bloom_filter_ = nullptr) - : function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {} + Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr && const_bloom_filter_ = nullptr) + : function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {} Function function = FUNCTION_UNKNOWN; - /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS + /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS and FUNCTION_MULTI_SEARCH size_t key_column; + + /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS std::unique_ptr bloom_filter; - /// For FUNCTION_IN and FUNCTION_NOT_IN + + /// For FUNCTION_IN, FUNCTION_NOT_IN and FUNCTION_MULTI_SEARCH std::vector> set_bloom_filters; + + /// For FUNCTION_IN and FUNCTION_NOT_IN std::vector set_key_position; }; diff --git a/dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.reference b/dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.reference new file mode 100644 index 00000000000..4a8c828426e --- /dev/null +++ b/dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.reference @@ -0,0 +1,53 @@ +9 abra +14 abracadabra + "rows_read": 6, +8 computer science + "rows_read": 2, +9 abra +10 cadabra +11 crabacadabra +14 abracadabra +15 cadabraabra + "rows_read": 6, +6 some string +7 another string + "rows_read": 2, +9 abra +14 abracadabra + "rows_read": 6, +8 computer science + "rows_read": 2, +1 ClickHouse is a column-oriented database management system (DBMS) +2 column-oriented database management system +13 basement + "rows_read": 6, +6 some string +7 another string + "rows_read": 2, +6 some string +7 another string +8 computer science + "rows_read": 4, +1 ClickHouse is a column-oriented database management system (DBMS) +2 column-oriented database management system +13 basement + "rows_read": 6, +9 abra +10 cadabra +11 crabacadabra +14 abracadabra +15 cadabraabra + "rows_read": 6, +4 какая-то строка +5 еще строка +6 some string +7 another string + "rows_read": 4, +14 abracadabra + "rows_read": 4, +1 ClickHouse is a column-oriented database management system (DBMS) +2 column-oriented database management system +10 cadabra +11 crabacadabra +15 cadabraabra + "rows_read": 8, diff --git a/dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.sh b/dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.sh new file mode 100755 index 00000000000..28120e782c1 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00964_bloom_index_string_functions.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. 
$CURDIR/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS bloom_filter_idx;" + +# NGRAM BF +$CLICKHOUSE_CLIENT -n --query=" +SET allow_experimental_data_skipping_indices = 1; +CREATE TABLE bloom_filter_idx +( + k UInt64, + s String, + INDEX bf (s, lower(s)) TYPE ngrambf_v1(3, 512, 2, 0) GRANULARITY 1 +) ENGINE = MergeTree() +ORDER BY k +SETTINGS index_granularity = 2;" + +$CLICKHOUSE_CLIENT --query="INSERT INTO bloom_filter_idx VALUES +(0, 'ClickHouse - столбцовая система управления базами данных (СУБД)'), +(1, 'ClickHouse is a column-oriented database management system (DBMS)'), +(2, 'column-oriented database management system'), +(3, 'columns'), +(4, 'какая-то строка'), +(5, 'еще строка'), +(6, 'some string'), +(7, 'another string'), +(8, 'computer science'), +(9, 'abra'), +(10, 'cadabra'), +(11, 'crabacadabra'), +(12, 'crab'), +(13, 'basement'), +(14, 'abracadabra'), +(15, 'cadabraabra')" + +# STARTS_WITH +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') ORDER BY k" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') ORDER BY k FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'computer') ORDER BY k" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'computer') ORDER BY k FORMAT JSON" | grep "rows_read" + +# ENDS_WITH +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'abra') ORDER BY k" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'abra') ORDER BY k FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'ring') ORDER BY k" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'ring') ORDER BY k FORMAT JSON" | grep "rows_read" + +# COMBINED +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra')" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra') FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'c') AND endsWith(s, 'science')" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'c') AND endsWith(s, 'science') FORMAT JSON" | grep "rows_read" + +# MULTY_SEARCH_ANY +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['data', 'base'])" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['data', 'base']) FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string'])" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string']) FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string', 'computer'])" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string', 'computer']) FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['base', 'seme', 'gement'])" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['base', 'seme', 'gement']) FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['abra', 'cadabra', 'cab', 'extra'])" 
+$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['abra', 'cadabra', 'cab', 'extra']) FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['строка', 'string'])" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['строка', 'string']) FORMAT JSON" | grep "rows_read" + +# MULTY_SEARCH_ANY + OTHER + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND startsWith(s, 'abra')" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND startsWith(s, 'abra') FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND (startsWith(s, 'c') OR startsWith(s, 'C'))" +$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND (startsWith(s, 'c') OR startsWith(s, 'C')) FORMAT JSON" | grep "rows_read" + +$CLICKHOUSE_CLIENT --query="DROP TABLE bloom_filter_idx;" diff --git a/dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh b/dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh index 9c6217a9d73..056915c0e7c 100755 --- a/dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh +++ b/dbms/tests/queries/0_stateless/00965_set_index_string_functions.sh @@ -51,16 +51,3 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data $CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data', 'base']) FORMAT JSON" | grep "rows_read" $CLICKHOUSE_CLIENT --query="DROP TABLE set_idx;" - - - - - - - - - - - - - From 4d14f8c23076b43784bf65b3ed775d9c5c1b5ee0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jul 2019 04:07:01 +0300 Subject: [PATCH 06/26] Fixed error in query_id printing --- dbms/src/Interpreters/ThreadStatusExt.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/Interpreters/ThreadStatusExt.cpp b/dbms/src/Interpreters/ThreadStatusExt.cpp index 757436edfb7..3667ff6c492 100644 --- a/dbms/src/Interpreters/ThreadStatusExt.cpp +++ b/dbms/src/Interpreters/ThreadStatusExt.cpp @@ -32,8 +32,6 @@ void ThreadStatus::attachQueryContext(Context & query_context_) if (!global_context) global_context = &query_context->getGlobalContext(); - query_id = query_context->getCurrentQueryId(); - if (thread_group) { std::lock_guard lock(thread_group->mutex); @@ -106,6 +104,9 @@ void ThreadStatus::attachQuery(const ThreadGroupStatusPtr & thread_group_, bool thread_group->thread_numbers.emplace_back(thread_number); } + if (query_context) + query_id = query_context->getCurrentQueryId(); + #if defined(__linux__) /// Set "nice" value if required. if (query_context) From 0f4075478e940f38390c3ef44c55d323aeefc61e Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Wed, 10 Jul 2019 10:28:47 +0300 Subject: [PATCH 07/26] Fixed DoubleDelta encoding cases for random Int32 and Int64. 
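The core of the fix is to do all delta arithmetic on unsigned types, where
wraparound is well defined, and to reinterpret as signed only when choosing
how many bits to write. A self-contained illustration (not code from the
patch; assumes the two's-complement platforms ClickHouse targets):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // Unsigned subtraction wraps modulo 2^32, so stepping down by one
        // yields 0xFFFFFFFF, which reads back as -1 once reinterpreted:
        uint32_t prev = 5, curr = 4;
        uint32_t delta = curr - prev;                 // 0xFFFFFFFF
        assert(static_cast<int32_t>(delta) == -1);

        // Adding the raw unsigned delta back reconstructs the value exactly,
        // which is what decompressDataForType() below relies on:
        assert(prev + delta == curr);
    }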
---
 .../CompressionCodecDoubleDelta.cpp           | 177 ++++++++++-------
 .../tests/gtest_compressionCodec.cpp          | 182 +++++++++++++-----
 dbms/src/IO/BitHelpers.h                      |   1 -
 3 files changed, 247 insertions(+), 113 deletions(-)

diff --git a/dbms/src/Compression/CompressionCodecDoubleDelta.cpp b/dbms/src/Compression/CompressionCodecDoubleDelta.cpp
index 80d363e0ec5..4e3f9eb9cd1 100644
--- a/dbms/src/Compression/CompressionCodecDoubleDelta.cpp
+++ b/dbms/src/Compression/CompressionCodecDoubleDelta.cpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include

 namespace DB
 {
@@ -24,28 +25,24 @@ extern const int CANNOT_DECOMPRESS;

 namespace
 {
-UInt32 getDeltaTypeByteSize(UInt8 data_bytes_size)
+
+Int64 getMaxValueForByteSize(UInt8 byte_size)
 {
-    // both delta and double delta can be twice the size of data item, but not less than 32 bits and not more that 64.
-    return std::min(64/8, std::max(32/8, data_bytes_size * 2));
-}
+    switch (byte_size)
+    {
+        case sizeof(UInt8):
+            return std::numeric_limits<Int8>::max();
+        case sizeof(UInt16):
+            return std::numeric_limits<Int16>::max();
+        case sizeof(UInt32):
+            return std::numeric_limits<Int32>::max();
+        case sizeof(UInt64):
+            return std::numeric_limits<Int64>::max();
+        default:
+            assert(false && "only 1,2,4 and 8 data sizes are supported");
+    }

-UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
-{
-    const UInt8 items_count_size = 4;
-
-    return items_count_size + data_bytes_size + getDeltaTypeByteSize(data_bytes_size);
-}
-
-UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
-{
-    const UInt32 items_count = uncompressed_size / data_bytes_size;
-
-    // 11111 + max 64 bits of double delta.
-    const UInt32 max_item_size_bits = 5 + getDeltaTypeByteSize(data_bytes_size) * 8;
-
-    // + 8 is to round up to next byte.
-    return (items_count * max_item_size_bits + 8) / 8;
+    return std::numeric_limits<Int64>::max();
 }

 struct WriteSpec
@@ -55,8 +52,10 @@ struct WriteSpec
     const UInt8 data_bits;
 };

+const std::array<UInt8, 5> DELTA_SIZES{7, 9, 12, 32, 64};
+
 template <typename T>
-WriteSpec getWriteSpec(const T & value)
+WriteSpec getDeltaWriteSpec(const T & value)
 {
     if (value > -63 && value < 64)
     {
@@ -80,27 +79,60 @@ WriteSpec getWriteSpec(const T & value)
     }
 }

-template <typename T, typename DeltaType>
+WriteSpec getDeltaMaxWriteSpecByteSize(UInt8 data_bytes_size)
+{
+    return getDeltaWriteSpec(getMaxValueForByteSize(data_bytes_size));
+}
+
+UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
+{
+    const UInt8 items_count_size = 4;
+    const UInt8 first_delta_bytes_size = data_bytes_size;
+
+    return items_count_size + data_bytes_size + first_delta_bytes_size;
+}
+
+UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
+{
+    const UInt32 items_count = uncompressed_size / data_bytes_size;
+    const auto double_delta_write_spec = getDeltaMaxWriteSpecByteSize(data_bytes_size);
+
+    const UInt32 max_item_size_bits = double_delta_write_spec.prefix_bits + double_delta_write_spec.data_bits;
+
+    // + 8 is to round up to next byte.
+    auto result = (items_count * max_item_size_bits + 7) / 8;
+
+    return result;
+}
+
+template <typename ValueType>
 UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
 {
-    static_assert(std::is_unsigned_v<T> && std::is_signed_v<DeltaType>, "T must be unsigned, while DeltaType must be signed integer type.");
-    using UnsignedDeltaType = typename std::make_unsigned<DeltaType>::type;
+    // Since only unsigned int has guaranteed two's-complement overflow handling, we are doing math here on unsigned types.
+    // To simplify and bulletproof the code, we enforce ValueType to be unsigned too.
+    static_assert(std::is_unsigned_v<ValueType>, "ValueType must be unsigned.");
+    using UnsignedDeltaType = ValueType;
+
+    // We use signed delta type to turn huge unsigned values into smaller signed:
+    // ffffffff => -1
+    using SignedDeltaType = typename std::make_signed<UnsignedDeltaType>::type;
+
+    if (source_size % sizeof(ValueType) != 0)
+        throw Exception("Cannot compress, data size " + toString(source_size)
+                        + " is not aligned to " + toString(sizeof(ValueType)), ErrorCodes::CANNOT_COMPRESS);

     const char * source_end = source + source_size;

-    const UInt32 items_count = source_size / sizeof(T);
+    const UInt32 items_count = source_size / sizeof(ValueType);
     unalignedStore<UInt32>(dest, items_count);
     dest += sizeof(items_count);

-    T prev_value{};
-    DeltaType prev_delta{};
+    ValueType prev_value{};
+    UnsignedDeltaType prev_delta{};

     if (source < source_end)
     {
-        prev_value = unalignedLoad<T>(source);
-        unalignedStore<T>(dest, prev_value);
+        prev_value = unalignedLoad<ValueType>(source);
+        unalignedStore<ValueType>(dest, prev_value);

         source += sizeof(prev_value);
         dest += sizeof(prev_value);
@@ -108,24 +140,26 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)

     if (source < source_end)
     {
-        const T curr_value = unalignedLoad<T>(source);
-        prev_delta = static_cast<DeltaType>(curr_value - prev_value);
-        unalignedStore<DeltaType>(dest, prev_delta);
+        const ValueType curr_value = unalignedLoad<ValueType>(source);
+
+        prev_delta = curr_value - prev_value;
+        unalignedStore<UnsignedDeltaType>(dest, prev_delta);

         source += sizeof(curr_value);
         dest += sizeof(prev_delta);
         prev_value = curr_value;
     }

-    WriteBuffer buffer(dest, getCompressedDataSize(sizeof(T), source_size - sizeof(T)*2));
+    WriteBuffer buffer(dest, getCompressedDataSize(sizeof(ValueType), source_size - sizeof(ValueType)*2));
     BitWriter writer(buffer);

-    for (; source < source_end; source += sizeof(T))
+    int item = 2;
+    for (; source < source_end; source += sizeof(ValueType), ++item)
     {
-        const T curr_value = unalignedLoad<T>(source);
+        const ValueType curr_value = unalignedLoad<ValueType>(source);

-        const DeltaType delta = static_cast<DeltaType>(curr_value - prev_value);
-        const DeltaType double_delta = delta - prev_delta;
+        const UnsignedDeltaType delta = curr_value - prev_value;
+        const UnsignedDeltaType double_delta = delta - prev_delta;

         prev_delta = delta;
         prev_value = curr_value;
@@ -136,9 +170,11 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
         }
         else
         {
-            const auto sign = std::signbit(double_delta);
-            const auto abs_value = static_cast<UnsignedDeltaType>(std::abs(double_delta));
-            const auto write_spec = getWriteSpec(double_delta);
+            const SignedDeltaType signed_dd = static_cast<SignedDeltaType>(double_delta);
+            const auto sign = std::signbit(signed_dd);
+            // -1 shrinks dd down to fit into number of bits, and there can't be 0, so it is OK.
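            // Worked example of the "- 1" shift: for signed_dd == -1 the stored payload
            // is abs(-1) - 1 == 0; the reader computes (payload + 1) and applies the sign
            // bit, recovering -1. Payload value 0 is never needed for double_delta == 0,
            // which already took the single-zero-bit branch.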
+ const auto abs_value = static_cast(std::abs(signed_dd) - 1); + const auto write_spec = getDeltaWriteSpec(signed_dd); writer.writeBits(write_spec.prefix_bits, write_spec.prefix); writer.writeBits(1, sign); @@ -151,22 +187,25 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) return sizeof(items_count) + sizeof(prev_value) + sizeof(prev_delta) + buffer.count(); } -template +template void decompressDataForType(const char * source, UInt32 source_size, char * dest) { - static_assert(std::is_unsigned_v && std::is_signed_v, "T must be unsigned, while DeltaType must be signed integer type."); + static_assert(std::is_unsigned_v, "ValueType must be unsigned."); + using UnsignedDeltaType = ValueType; + using SignedDeltaType = typename std::make_signed::type; + const char * source_end = source + source_size; const UInt32 items_count = unalignedLoad(source); source += sizeof(items_count); - T prev_value{}; - DeltaType prev_delta{}; + ValueType prev_value{}; + UnsignedDeltaType prev_delta{}; if (source < source_end) { - prev_value = unalignedLoad(source); - unalignedStore(dest, prev_value); + prev_value = unalignedLoad(source); + unalignedStore(dest, prev_value); source += sizeof(prev_value); dest += sizeof(prev_value); @@ -174,9 +213,9 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) if (source < source_end) { - prev_delta = unalignedLoad(source); - prev_value = prev_value + static_cast(prev_delta); - unalignedStore(dest, prev_value); + prev_delta = unalignedLoad(source); + prev_value = prev_value + static_cast(prev_delta); + unalignedStore(dest, prev_value); source += sizeof(prev_delta); dest += sizeof(prev_value); @@ -189,32 +228,35 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest) // we have to keep track of items to avoid reading more that there is. for (UInt32 items_read = 2; items_read < items_count && !reader.eof(); ++items_read) { - DeltaType double_delta = 0; + UnsignedDeltaType double_delta = 0; if (reader.readBit() == 1) { - const UInt8 data_sizes[] = {6, 8, 11, 31, 63}; UInt8 i = 0; - for (; i < sizeof(data_sizes) - 1; ++i) + for (; i < sizeof(DELTA_SIZES) - 1; ++i) { const auto next_bit = reader.readBit(); if (next_bit == 0) + { break; + } } const UInt8 sign = reader.readBit(); - double_delta = static_cast(reader.readBits(data_sizes[i])); + SignedDeltaType signed_dd = static_cast(reader.readBits(DELTA_SIZES[i] - 1) + 1); if (sign) { - double_delta *= -1; + signed_dd *= -1; } + double_delta = static_cast(signed_dd); } // else if first bit is zero, no need to read more data. 
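            // Prefix code being read here, for reference (payload widths are
            // DELTA_SIZES[i] - 1, i.e. the old data_sizes {6, 8, 11, 31, 63}):
            //   0      -> double delta is exactly 0, nothing more to read
            //   10     -> sign bit + 6 payload bits
            //   110    -> sign bit + 8 payload bits
            //   1110   -> sign bit + 11 payload bits
            //   11110  -> sign bit + 31 payload bits
            //   11111  -> sign bit + 63 payload bits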
- const T curr_value = prev_value + static_cast(prev_delta + double_delta); - unalignedStore(dest, curr_value); + const UnsignedDeltaType delta = double_delta + prev_delta; + const ValueType curr_value = prev_value + delta; + unalignedStore(dest, curr_value); dest += sizeof(curr_value); - prev_delta = static_cast(curr_value - prev_value); + prev_delta = curr_value - prev_value; prev_value = curr_value; } } @@ -267,19 +309,20 @@ UInt32 CompressionCodecDoubleDelta::doCompressData(const char * source, UInt32 s memcpy(&dest[2], source, bytes_to_skip); size_t start_pos = 2 + bytes_to_skip; UInt32 compressed_size = 0; + switch (data_bytes_size) { case 1: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); break; case 2: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); break; case 4: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); break; case 8: - compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); + compressed_size = compressDataForType(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]); break; } @@ -296,16 +339,16 @@ void CompressionCodecDoubleDelta::doDecompressData(const char * source, UInt32 s switch (bytes_size) { case 1: - decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); + decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); break; case 2: - decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); + decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); break; case 4: - decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); + decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); break; case 8: - decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); + decompressDataForType(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]); break; } } diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp index e1413ccd7bd..43c390244f4 100644 --- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -8,16 +8,16 @@ #include +#include #include #include #include -#include -#include -#include -#include -#include #include -#include +#include +#include +#include +#include + #include #pragma GCC diagnostic ignored "-Wsign-compare" @@ -119,36 +119,71 @@ template result = ::testing::AssertionFailure(); } - result << "mismatching " << sizeof(T) << "-byte item #" << i - << "\nexpected: " << bin(left_value) - << "\ngot : " << bin(right_value) - << std::endl; - - if (++mismatching_items >= MAX_MISMATCHING_ITEMS) + if (++mismatching_items <= MAX_MISMATCHING_ITEMS) { - result << "..." 
<< std::endl; - break; + result << "mismatching " << sizeof(T) << "-byte item #" << i + << "\nexpected: " << bin(left_value) << " (0x" << std::hex << left_value << ")" + << "\ngot : " << bin(right_value) << " (0x" << std::hex << right_value << ")" + << std::endl; + if (mismatching_items == MAX_MISMATCHING_ITEMS) + { + result << "..." << std::endl; + } } } } + if (mismatching_items > 0) + { + result << "\ntotal mismatching items:" << mismatching_items << " of " << size; + } return result; } struct CodecTestParam { + std::string type_name; std::vector source_data; UInt8 data_byte_size; + double min_compression_ratio; std::string case_name; + + // to allow setting ratio after building with complex builder functions. + CodecTestParam && setRatio(const double & ratio) && + { + this->min_compression_ratio = ratio; + return std::move(*this); + } }; +CodecTestParam operator+(CodecTestParam && left, CodecTestParam && right) +{ + assert(left.type_name == right.type_name); + assert(left.data_byte_size == right.data_byte_size); + + std::vector data(std::move(left.source_data)); + data.insert(data.end(), right.source_data.begin(), right.source_data.end()); + + return CodecTestParam{ + left.type_name, + std::move(data), + left.data_byte_size, + std::min(left.min_compression_ratio, right.min_compression_ratio), + left.case_name + " + " + right.case_name + }; +} + std::ostream & operator<<(std::ostream & ostr, const CodecTestParam & param) { return ostr << "name: " << param.case_name + << "\ntype name:" << param.type_name << "\nbyte size: " << static_cast(param.data_byte_size) << "\ndata size: " << param.source_data.size(); } +// compression ratio < 1.0 means that codec output is smaller than input. +const double DEFAULT_MIN_COMPRESSION_RATIO = 1.0; + template CodecTestParam makeParam(Args && ... args) { @@ -162,11 +197,11 @@ CodecTestParam makeParam(Args && ... 
args) write_pos += sizeof(v); } - return CodecTestParam{std::move(data), sizeof(T), - (boost::format("%1% %2%") % (sizeof(T) * std::size(vals)) % " predefined values").str()}; + return CodecTestParam{type_name(), std::move(data), sizeof(T), DEFAULT_MIN_COMPRESSION_RATIO, + (boost::format("%1% values of %2%") % std::size(vals) % type_name()).str()}; } -template +template CodecTestParam generateParam(Generator gen, const char* gen_name) { static_assert (End >= Begin, "End must be not less than Begin"); @@ -181,8 +216,8 @@ CodecTestParam generateParam(Generator gen, const char* gen_name) write_pos += sizeof(v); } - return CodecTestParam{std::move(data), sizeof(T), - (boost::format("%1% from %2% (%3% => %4%)") % type_name() % gen_name % Begin % End).str()}; + return CodecTestParam{type_name(), std::move(data), sizeof(T), DEFAULT_MIN_COMPRESSION_RATIO, + (boost::format("%1% values of %2% from %3%") % (End - Begin) % type_name() % gen_name).str()}; } void TestTranscoding(ICompressionCodec * codec, const CodecTestParam & param) @@ -216,6 +251,13 @@ void TestTranscoding(ICompressionCodec * codec, const CodecTestParam & param) default: FAIL() << "Invalid data_byte_size: " << param.data_byte_size; } + const auto header_size = codec->getHeaderSize(); + const auto compression_ratio = (encoded_size - header_size) / (source_data.size() * 1.0); + + ASSERT_LE(compression_ratio, param.min_compression_ratio) + << "\n\tdecoded size: " << source_data.size() + << "\n\tencoded size: " << encoded_size + << "(no header: " << encoded_size - header_size << ")"; } class CodecTest : public ::testing::TestWithParam @@ -230,20 +272,34 @@ public: TEST_P(CodecTest, DoubleDelta) { - const auto & param = GetParam(); + auto param = GetParam(); auto codec = std::make_unique(param.data_byte_size); + if (param.type_name == type_name() || param.type_name == type_name()) + { + // dd doesn't work great with many cases of integers and may result in very poor compression rate. + param.min_compression_ratio *= 1.5; + } TestTranscoding(codec.get(), param); } TEST_P(CodecTest, Gorilla) { - const auto & param = GetParam(); + auto param = GetParam(); auto codec = std::make_unique(param.data_byte_size); + if (param.type_name == type_name() || param.type_name == type_name() + || param.type_name == type_name() || param.type_name == type_name()) + { + // gorilla doesn't work great with many cases of integers and may result in very poor compression rate. + param.min_compression_ratio *= 1.5; + } TestTranscoding(codec.get(), param); } +// Here we use generators to produce test payload for codecs. +// Generator is a callable that should produce output value of the same type as input value. + auto SameValueGenerator = [](auto value) { return [=](auto i) @@ -261,30 +317,44 @@ auto SequentialGenerator = [](auto stride = 1) }; }; +// Generator that helps debugging output of other generators +// by logging every output value alongside iteration index and input. 
+//auto LoggingProxyGenerator = [](auto other_generator, const char * name, std::ostream & ostr, const int limit = std::numeric_limits<int>::max())
+//{
+//    ostr << "\n\nValues from " << name << ":\n";
+//    auto count = std::make_shared<int>(0);
+//    return [&, count](auto i)
+//    {
+//        using ValueType = decltype(i);
+//        const auto ret = static_cast<ValueType>(other_generator(i));
+//        if (++(*count) < limit)
+//        {
+//            ostr << "\t" << *count << " : " << i << " => " << ret << "\n";
+//        }
+
+//        return ret;
+//    };
+//};
+
 template <typename T>
 struct MonotonicGenerator
 {
     MonotonicGenerator(T stride = 1, size_t max_step = 10)
-        : prev_value{},
+        : prev_value(0),
           stride(stride),
           max_step(max_step)
     {}

     template <typename U>
-    U operator()(U i)
+    U operator()(U)
     {
-        if (!prev_value.has_value())
-        {
-            prev_value = i * stride;
-        }
-
-        const U result = *prev_value + static_cast<U>(stride * (rand() % max_step));
+        const U result = prev_value + static_cast<U>(stride * (rand() % max_step));

         prev_value = result;
         return result;
     }

-    std::optional<T> prev_value;
+    T prev_value;
     const T stride;
     const size_t max_step;
 };
@@ -301,25 +371,45 @@ auto MinMaxGenerator = [](auto i)
     }
 };

-auto RandomGenerator = [](auto i) {return static_cast<decltype(i)>(rand());};
+template <typename T>
+struct RandomGenerator
+{
+    RandomGenerator(T seed = 0, T value_cap = std::numeric_limits<T>::max())
+        : e(seed),
+          value_cap(value_cap)
+    {
+    }
+
+    template <typename U>
+    U operator()(U i)
+    {
+        return static_cast<U>(distribution(e) % value_cap);
+    }
+
+private:
+    std::default_random_engine e;
+    std::uniform_int_distribution<T> distribution;
+    const T value_cap;
+};

 auto RandomishGenerator = [](auto i)
 {
     return static_cast<decltype(i)>(sin(static_cast<double>(i) * i) * i);
 };

-INSTANTIATE_TEST_CASE_P(Basic,
+// helper macro to produce human-friendly test case name
+#define G(generator) generator, #generator
+
+INSTANTIATE_TEST_CASE_P(Mixed,
     CodecTest,
     ::testing::Values(
-        makeParam<UInt32>(1, 2, 3, 4),
-        makeParam<UInt64>(1, 2, 3, 4),
-        makeParam<Float32>(1.1, 2.2, 3.3, 4.4),
-        makeParam<Float64>(1.1, 2.2, 3.3, 4.4)
+        generateParam<Int32>(G(MinMaxGenerator)) + generateParam<Int32>(G(SequentialGenerator(1))).setRatio(1),
+        generateParam<Int64>(G(MinMaxGenerator)) + generateParam<Int64>(G(SequentialGenerator(1))).setRatio(1),
+        generateParam<UInt32>(G(MinMaxGenerator)) + generateParam<UInt32>(G(SequentialGenerator(1))).setRatio(1),
+        generateParam<UInt64>(G(MinMaxGenerator)) + generateParam<UInt64>(G(SequentialGenerator(1))).setRatio(1)
     ),
 );

-#define G(generator) generator, #generator
-
 INSTANTIATE_TEST_CASE_P(Same,
     CodecTest,
     ::testing::Values(
@@ -359,18 +449,20 @@ INSTANTIATE_TEST_CASE_P(Monotonic,
 INSTANTIATE_TEST_CASE_P(Random,
     CodecTest,
     ::testing::Values(
-        generateParam<UInt32>(G(RandomGenerator)),
-        generateParam<UInt64>(G(RandomGenerator))
+        generateParam<UInt32>(G(RandomGenerator<UInt32>(0, 1000'000'000))).setRatio(1.2),
+        generateParam<UInt64>(G(RandomGenerator<UInt64>(0, 1000'000'000))).setRatio(1.1)
     ),
 );

-INSTANTIATE_TEST_CASE_P(RandomLike,
+INSTANTIATE_TEST_CASE_P(Randomish,
     CodecTest,
     ::testing::Values(
-        generateParam<Int32>(G(RandomishGenerator)),
-        generateParam<Int64>(G(RandomishGenerator)),
-        generateParam<Float32>(G(RandomishGenerator)),
-        generateParam<Float64>(G(RandomishGenerator))
+        generateParam<Int32>(G(RandomishGenerator)).setRatio(1.1),
+        generateParam<Int64>(G(RandomishGenerator)).setRatio(1.1),
+        generateParam<UInt32>(G(RandomishGenerator)).setRatio(1.1),
+        generateParam<UInt64>(G(RandomishGenerator)).setRatio(1.1),
+        generateParam<Float32>(G(RandomishGenerator)).setRatio(1.1),
+        generateParam<Float64>(G(RandomishGenerator)).setRatio(1.1)
     ),
 );

diff --git a/dbms/src/IO/BitHelpers.h b/dbms/src/IO/BitHelpers.h
index 3652dd0057a..1947d9d99ba 100644
--- a/dbms/src/IO/BitHelpers.h
+++ b/dbms/src/IO/BitHelpers.h
@@ -150,7 +150,6 @@ public:
        const UInt64 mask = maskLowBits<UInt64>(to_write);
         v &= mask;
-//        assert(v <= 255);

         bits_buffer <<= to_write;
         bits_buffer |= v;

From 68ba42cc897cdc8dadc8e58bcb445dc8c0e29213 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 15 Jul 2019 17:45:56 +0300
Subject: [PATCH 08/26] Using std::gcd, lcm instead of boost

---
 dbms/src/Functions/gcd.cpp | 5 +++--
 dbms/src/Functions/lcm.cpp | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/dbms/src/Functions/gcd.cpp b/dbms/src/Functions/gcd.cpp
index 0373799bd74..d48318ce115 100644
--- a/dbms/src/Functions/gcd.cpp
+++ b/dbms/src/Functions/gcd.cpp
@@ -1,6 +1,7 @@
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionBinaryArithmetic.h>
-#include <boost/integer/common_factor.hpp>
+#include <numeric>
+

 namespace DB
 {
@@ -15,7 +16,7 @@ struct GCDImpl
     {
         throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<A>::Type(a), typename NumberTraits::ToInteger<B>::Type(b));
         throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<B>::Type(b), typename NumberTraits::ToInteger<A>::Type(a));
-        return boost::integer::gcd(
+        return std::gcd(
             typename NumberTraits::ToInteger<Result>::Type(a),
             typename NumberTraits::ToInteger<Result>::Type(b));
     }

diff --git a/dbms/src/Functions/lcm.cpp b/dbms/src/Functions/lcm.cpp
index bf2b7e1e79f..a76519a6603 100644
--- a/dbms/src/Functions/lcm.cpp
+++ b/dbms/src/Functions/lcm.cpp
@@ -1,6 +1,7 @@
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionBinaryArithmetic.h>
-#include <boost/integer/common_factor.hpp>
+#include <numeric>
+

 namespace DB
 {
@@ -15,7 +16,7 @@ struct LCMImpl
     {
         throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<A>::Type(a), typename NumberTraits::ToInteger<B>::Type(b));
         throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger<B>::Type(b), typename NumberTraits::ToInteger<A>::Type(a));
-        return boost::integer::lcm(
+        return std::lcm(
             typename NumberTraits::ToInteger<Result>::Type(a),
             typename NumberTraits::ToInteger<Result>::Type(b));
     }

From e7b23d4fb2965d08e5846c49baeea5ff33b830df Mon Sep 17 00:00:00 2001
From: Mikhail Filimonov
Date: Mon, 15 Jul 2019 16:53:16 +0200
Subject: [PATCH 09/26] configs for query masking rules tests

---
 dbms/tests/config/query_masking_rules.xml | 10 ++++++++++
 dbms/tests/server-test.xml                |  6 ++++++
 2 files changed, 16 insertions(+)
 create mode 100644 dbms/tests/config/query_masking_rules.xml

diff --git a/dbms/tests/config/query_masking_rules.xml b/dbms/tests/config/query_masking_rules.xml
new file mode 100644
index 00000000000..5a854848f3d
--- /dev/null
+++ b/dbms/tests/config/query_masking_rules.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<yandex>
+    <query_masking_rules>
+        <rule>
+            <regexp>TOPSECRET.TOPSECRET</regexp>
+            <replace>[hidden]</replace>
+        </rule>
+    </query_masking_rules>
+</yandex>
+

diff --git a/dbms/tests/server-test.xml b/dbms/tests/server-test.xml
index c20d34cce3f..d68cbca53c1 100644
--- a/dbms/tests/server-test.xml
+++ b/dbms/tests/server-test.xml
@@ -117,4 +117,10 @@
         <path>/clickhouse/task_queue/ddl</path>
     </distributed_ddl>
     <format_schema_path>/tmp/clickhouse/data/format_schemas/</format_schema_path>
+    <query_masking_rules>
+        <rule>
+            <regexp>TOPSECRET.TOPSECRET</regexp>
+            <replace>[hidden]</replace>
+        </rule>
+    </query_masking_rules>
 </yandex>

From ff8214c65ed6b73dff35b55d734aa71b58acd993 Mon Sep 17 00:00:00 2001
From: Mikhail Filimonov
Date: Mon, 15 Jul 2019 16:57:15 +0200
Subject: [PATCH 10/26] Dockerfile adjustment

---
 docker/test/stateless/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile
index 4e1a20d848b..71fc6b3f40a 100644
--- a/docker/test/stateless/Dockerfile
+++ b/docker/test/stateless/Dockerfile
@@ -39,13 +39,14 @@ CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
     ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/config.d/; \
     ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config.d/; \
     ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/; \
+    ln -s
/usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/; \ ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/; \ ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/; \ ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/; \ ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/; \ ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/; \ ln -s /usr/lib/llvm-8/bin/llvm-symbolizer /usr/bin/llvm-symbolizer; \ - echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7'" >> /etc/environment; \ + echo "TSAN_OPTIONS='halt_on_error=1 history_size=7'" >> /etc/environment; \ echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \ echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \ echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \ From be718f320a12714c399373f27a56687d7c93ec74 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Mon, 15 Jul 2019 17:01:26 +0200 Subject: [PATCH 11/26] fix accidental change --- docker/test/stateless/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 71fc6b3f40a..e2cd5eee933 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -46,7 +46,7 @@ CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \ ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/; \ ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/; \ ln -s /usr/lib/llvm-8/bin/llvm-symbolizer /usr/bin/llvm-symbolizer; \ - echo "TSAN_OPTIONS='halt_on_error=1 history_size=7'" >> /etc/environment; \ + echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7'" >> /etc/environment; \ echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \ echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \ echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \ From b3c0292613c759bec920cbe00b1a85c847c4ef97 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jul 2019 19:32:52 +0300 Subject: [PATCH 12/26] Fixed error; added test --- dbms/src/Interpreters/ThreadStatusExt.cpp | 3 ++- .../queries/0_stateless/00971_query_id_in_logs.reference | 0 dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 dbms/tests/queries/0_stateless/00971_query_id_in_logs.reference create mode 100755 dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh diff --git a/dbms/src/Interpreters/ThreadStatusExt.cpp b/dbms/src/Interpreters/ThreadStatusExt.cpp index 3667ff6c492..f9b872ae292 100644 --- a/dbms/src/Interpreters/ThreadStatusExt.cpp +++ b/dbms/src/Interpreters/ThreadStatusExt.cpp @@ -29,6 +29,7 @@ namespace ErrorCodes void ThreadStatus::attachQueryContext(Context & query_context_) { query_context = &query_context_; + query_id = query_context->getCurrentQueryId(); if (!global_context) global_context = &query_context->getGlobalContext(); @@ -270,7 +271,7 @@ void CurrentThread::attachQueryContext(Context & query_context) { if (unlikely(!current_thread)) return; - return current_thread->attachQueryContext(query_context); + 
current_thread->attachQueryContext(query_context);
 }

 void CurrentThread::finalizePerformanceCounters()

diff --git a/dbms/tests/queries/0_stateless/00971_query_id_in_logs.reference b/dbms/tests/queries/0_stateless/00971_query_id_in_logs.reference
new file mode 100644
index 00000000000..e69de29bb2d

diff --git a/dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh b/dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh
new file mode 100755
index 00000000000..37563ee4f46
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+. $CURDIR/../shell_config.sh
+
+set -e
+
+# No log lines without query id
+$CLICKHOUSE_CLIENT --send_logs_level=trace --query_id=hello --query="SELECT count() FROM numbers(10)" 2>&1 | grep -vF ' {hello} ' | grep -P '<\w+>'

From dfe178d653cdaf30ed88d9c47cca567b0802099a Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 15 Jul 2019 20:00:18 +0300
Subject: [PATCH 13/26] Disable mimalloc, because we suspect it zeroes out
 random memory segments

---
 cmake/find_mimalloc.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/find_mimalloc.cmake b/cmake/find_mimalloc.cmake
index 6e3f24625b6..ada53387cf2 100644
--- a/cmake/find_mimalloc.cmake
+++ b/cmake/find_mimalloc.cmake
@@ -1,5 +1,5 @@
 if (OS_LINUX AND NOT SANITIZE AND NOT ARCH_ARM AND NOT ARCH_32 AND NOT ARCH_PPC64LE)
-    option (ENABLE_MIMALLOC "Set to FALSE to disable usage of mimalloc for internal ClickHouse caches" ${NOT_UNBUNDLED})
+    option (ENABLE_MIMALLOC "Set to FALSE to disable usage of mimalloc for internal ClickHouse caches" FALSE)
 endif ()

 if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mimalloc/include/mimalloc.h")

From f375a4559afe244f5552c8476f4be45dbbb2c8d1 Mon Sep 17 00:00:00 2001
From: Danila Kutenin
Date: Mon, 15 Jul 2019 20:49:54 +0300
Subject: [PATCH 14/26] Add exception checking in mimalloc

---
 dbms/src/Common/MiAllocator.cpp | 39 ++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/dbms/src/Common/MiAllocator.cpp b/dbms/src/Common/MiAllocator.cpp
index 456609374ee..cafa6c135f7 100644
--- a/dbms/src/Common/MiAllocator.cpp
+++ b/dbms/src/Common/MiAllocator.cpp
@@ -5,15 +5,33 @@
 #include "MiAllocator.h"
 #include <mimalloc.h>

+#include <Common/Exception.h>
+#include <Common/formatReadable.h>
+#include <IO/WriteHelpers.h>
+
 namespace DB
 {
+namespace ErrorCodes
+{
+    extern const int CANNOT_ALLOCATE_MEMORY;
+}

 void * MiAllocator::alloc(size_t size, size_t alignment)
 {
+    void * ptr;
     if (alignment == 0)
-        return mi_malloc(size);
+    {
+        ptr = mi_malloc(size);
+        if (!ptr)
+            DB::throwFromErrno("MiAllocator: Cannot allocate in mimalloc " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
+    }
     else
-        return mi_malloc_aligned(size, alignment);
+    {
+        ptr = mi_malloc_aligned(size, alignment);
+        if (!ptr)
+            DB::throwFromErrno("MiAllocator: Cannot allocate in mimalloc (mi_malloc_aligned) " + formatReadableSizeWithBinarySuffix(size) + " with alignment " + toString(alignment) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
+    }
+    return ptr;
 }

 void MiAllocator::free(void * buf, size_t)
@@ -32,10 +50,21 @@ void * MiAllocator::realloc(void * old_ptr, size_t, size_t new_size, size_t alignment)
         return nullptr;
     }

-    if (alignment == 0)
-        return mi_realloc(old_ptr, alignment);
+    void * ptr;

-    return mi_realloc_aligned(old_ptr, new_size, alignment);
+    if (alignment == 0)
+    {
+        ptr = mi_realloc(old_ptr, new_size);
+        if (!ptr)
+            DB::throwFromErrno("MiAllocator: Cannot reallocate in mimalloc "
" + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY); + } + else + { + ptr = mi_realloc_aligned(old_ptr, new_size, alignment); + if (!ptr) + DB::throwFromErrno("MiAllocator: Cannot reallocate in mimalloc (mi_realloc_aligned) " + formatReadableSizeWithBinarySuffix(size) + " with alignment " + toString(alignment) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY); + } + return ptr; } } From 25ed73779885517b43c35ee72a6614ebbaf1890e Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 15 Jul 2019 21:05:49 +0300 Subject: [PATCH 15/26] Fix bug client version which is able to read additional progress fields from server --- dbms/src/Core/Defines.h | 2 +- dbms/tests/integration/test_old_versions_client/test.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dbms/src/Core/Defines.h b/dbms/src/Core/Defines.h index f611f9df04e..461278fad3b 100644 --- a/dbms/src/Core/Defines.h +++ b/dbms/src/Core/Defines.h @@ -56,7 +56,7 @@ #define DBMS_MIN_REVISION_WITH_LOW_CARDINALITY_TYPE 54405 -#define DBMS_MIN_REVISION_WITH_CLIENT_WRITE_INFO 54421 +#define DBMS_MIN_REVISION_WITH_CLIENT_WRITE_INFO 54420 /// Version of ClickHouse TCP protocol. Set to git tag with latest protocol change. #define DBMS_TCP_PROTOCOL_VERSION 54226 diff --git a/dbms/tests/integration/test_old_versions_client/test.py b/dbms/tests/integration/test_old_versions_client/test.py index 2977d82ee9a..59a4aaca31c 100644 --- a/dbms/tests/integration/test_old_versions_client/test.py +++ b/dbms/tests/integration/test_old_versions_client/test.py @@ -11,13 +11,14 @@ node18_14 = cluster.add_instance('node18_14', image='yandex/clickhouse-server:18 node19_1 = cluster.add_instance('node19_1', image='yandex/clickhouse-server:19.1.16', with_installed_binary=True) node19_4 = cluster.add_instance('node19_4', image='yandex/clickhouse-server:19.4.5.35', with_installed_binary=True) node19_6 = cluster.add_instance('node19_6', image='yandex/clickhouse-server:19.6.3.18', with_installed_binary=True) +node19_8 = cluster.add_instance('node19_8', image='yandex/clickhouse-server:19.8.3.8', with_installed_binary=True) node_new = cluster.add_instance('node_new') @pytest.fixture(scope="module") def setup_nodes(): try: cluster.start() - for n in (node18_14, node19_1, node19_4, node19_6, node_new): + for n in (node18_14, node19_1, node19_4, node19_6, node19_8, node_new): n.query('''CREATE TABLE test_table (id UInt32, value UInt64) ENGINE = MergeTree() ORDER BY tuple()''') yield cluster @@ -29,7 +30,7 @@ def query_from_one_node_to_another(client_node, server_node, query): client_node.exec_in_container(["bash", "-c", "/usr/bin/clickhouse client --host {} --query '{}'".format(server_node.name, query)]) def test_client_from_different_versions(setup_nodes): - old_nodes = (node18_14, node19_1, node19_4, node19_6,) + old_nodes = (node18_14, node19_1, node19_4, node19_6, node19_8) # from new to old for n in old_nodes: query_from_one_node_to_another(node_new, n, "INSERT INTO test_table VALUES (1, 1)") From a175c5513672bb6f88de64c0b397bc6be3627978 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jul 2019 21:49:30 +0300 Subject: [PATCH 16/26] Added a test that shows catastrophic behaviour of mimalloc --- dbms/src/Common/tests/CMakeLists.txt | 3 + dbms/src/Common/tests/mi_malloc_test.cpp | 92 ++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 dbms/src/Common/tests/mi_malloc_test.cpp diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index 
index fb28dd14a43..83d69b2c8f2 100644
--- a/dbms/src/Common/tests/CMakeLists.txt
+++ b/dbms/src/Common/tests/CMakeLists.txt
@@ -75,3 +75,6 @@ target_link_libraries (cow_compositions PRIVATE clickhouse_common_io)

 add_executable (stopwatch stopwatch.cpp)
 target_link_libraries (stopwatch PRIVATE clickhouse_common_io)
+
+add_executable (mi_malloc_test mi_malloc_test.cpp)
+target_link_libraries (mi_malloc_test PRIVATE clickhouse_common_io)

diff --git a/dbms/src/Common/tests/mi_malloc_test.cpp b/dbms/src/Common/tests/mi_malloc_test.cpp
new file mode 100644
index 00000000000..28df99eeb8f
--- /dev/null
+++ b/dbms/src/Common/tests/mi_malloc_test.cpp
@@ -0,0 +1,92 @@
+#include <Common/config.h>
+
+//#undef USE_MIMALLOC
+//#define USE_MIMALLOC 0
+
+#if USE_MIMALLOC
+
+#include <mimalloc.h>
+#define malloc mi_malloc
+#define free mi_free
+
+#else
+
+#include <cstdlib>
+
+#endif
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <stdexcept>
+#include <cstddef>
+#include <cstdint>
+
+
+size_t total_size{0};
+
+struct Allocation
+{
+    void * ptr = nullptr;
+    size_t size = 0;
+
+    Allocation() {}
+
+    Allocation(size_t size)
+        : size(size)
+    {
+        ptr = malloc(size);
+        if (!ptr)
+            throw std::runtime_error("Cannot allocate memory");
+        total_size += size;
+    }
+
+    ~Allocation()
+    {
+        if (ptr)
+        {
+            free(ptr);
+            total_size -= size;
+        }
+        ptr = nullptr;
+    }
+
+    Allocation(const Allocation &) = delete;
+
+    Allocation(Allocation && rhs)
+    {
+        ptr = rhs.ptr;
+        size = rhs.size;
+        rhs.ptr = nullptr;
+        rhs.size = 0;
+    }
+};
+
+
+int main(int, char **)
+{
+    std::vector<Allocation> allocations;
+
+    constexpr size_t limit = 100000000;
+    constexpr size_t min_alloc_size = 65536;
+    constexpr size_t max_alloc_size = 10000000;
+
+    std::mt19937 rng;
+    auto distribution = std::uniform_int_distribution<size_t>(min_alloc_size, max_alloc_size);
+
+    size_t total_allocations = 0;
+
+    while (true)
+    {
+        size_t size = distribution(rng);
+
+        while (total_size + size > limit)
+            allocations.pop_back();
+
+        allocations.emplace_back(size);
+
+        ++total_allocations;
+        if (total_allocations % (1ULL << 20) == 0)
+            std::cerr << "Total allocations: " << total_allocations << "\n";
+    }
+}

From 1937623f18d7678812d9312fed538b6b5d8207de Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 15 Jul 2019 22:13:19 +0300
Subject: [PATCH 17/26] Better test

---
 dbms/src/Common/tests/mi_malloc_test.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dbms/src/Common/tests/mi_malloc_test.cpp b/dbms/src/Common/tests/mi_malloc_test.cpp
index 28df99eeb8f..6a565b71ae6 100644
--- a/dbms/src/Common/tests/mi_malloc_test.cpp
+++ b/dbms/src/Common/tests/mi_malloc_test.cpp
@@ -1,3 +1,10 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <stdexcept>
+#include <cstddef>
+#include <cstdint>
+
 #include <Common/config.h>

 //#undef USE_MIMALLOC
@@ -15,13 +22,6 @@

 #endif

-#include <iostream>
-#include <vector>
-#include <random>
-#include <stdexcept>
-#include <cstddef>
-#include <cstdint>
-

From 0d371473773769bc37409bc91c76ee9ee10d6a25 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 15 Jul 2019 22:21:37 +0300
Subject: [PATCH 18/26] Added fatal error in cmake

---
 cmake/find_mimalloc.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmake/find_mimalloc.cmake b/cmake/find_mimalloc.cmake
index ada53387cf2..1820421379f 100644
--- a/cmake/find_mimalloc.cmake
+++ b/cmake/find_mimalloc.cmake
@@ -8,6 +8,8 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mimalloc/include/mimalloc.h")
 endif ()

 if (ENABLE_MIMALLOC)
+    message (FATAL_ERROR "Mimalloc is not production ready. (Disable with cmake -D ENABLE_MIMALLOC=0).
If you want to use mimalloc, you must manually remove this message.") + set (MIMALLOC_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/mimalloc/include) set (USE_MIMALLOC 1) set (MIMALLOC_LIBRARY mimalloc-static) From 066337e197f7cf2809c9e833ca6fa10bfd660532 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jul 2019 22:29:54 +0300 Subject: [PATCH 19/26] Fixed test --- dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh b/dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh index 37563ee4f46..a4ef7671f48 100755 --- a/dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh +++ b/dbms/tests/queries/0_stateless/00971_query_id_in_logs.sh @@ -6,4 +6,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) set -e # No log lines without query id -$CLICKHOUSE_CLIENT --send_logs_level=trace --query_id=hello --query="SELECT count() FROM numbers(10)" 2>&1 | grep -vF ' {hello} ' | grep -P '<\w+>' +$CLICKHOUSE_CLIENT --send_logs_level=trace --query_id=hello --query="SELECT count() FROM numbers(10)" 2>&1 | grep -vF ' {hello} ' | grep -P '<\w+>' ||: From 21958c47cd50b5b08a51d753703ceb8c67b39f98 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jul 2019 23:06:20 +0300 Subject: [PATCH 20/26] Added failing test --- ...form_read_distribution_and_max_rows_to_read.sql | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql diff --git a/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql b/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql new file mode 100644 index 00000000000..cdfcd105470 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS merge_tree; +CREATE TABLE merge_tree (x UInt8) ENGINE = MergeTree ORDER BY x; +INSERT INTO merge_tree SELECT 0 FROM numbers(1000000); + +SET max_threads = 4; +SET max_rows_to_read = 1100000; + +SELECT count() FROM merge_tree; + +SET merge_tree_uniform_read_distribution = 0; + +SELECT count() FROM merge_tree; + +DROP TABLE merge_tree; From 77fb3de2d6160e61459127f0872b585cbe9f9e62 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jul 2019 23:26:22 +0300 Subject: [PATCH 21/26] Improved test --- ...rm_read_distribution_and_max_rows_to_read.reference | 2 ++ ..._uniform_read_distribution_and_max_rows_to_read.sql | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.reference diff --git a/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.reference b/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.reference new file mode 100644 index 00000000000..fcd78da1283 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.reference @@ -0,0 +1,2 @@ +1000000 +1000000 diff --git a/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql b/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql index cdfcd105470..37d09a3d3dd 100644 --- 
a/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql +++ b/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql @@ -5,10 +5,18 @@ INSERT INTO merge_tree SELECT 0 FROM numbers(1000000); SET max_threads = 4; SET max_rows_to_read = 1100000; +SET merge_tree_uniform_read_distribution = 1; SELECT count() FROM merge_tree; SET merge_tree_uniform_read_distribution = 0; - SELECT count() FROM merge_tree; +SET max_rows_to_read = 900000; + +SET merge_tree_uniform_read_distribution = 1; +SELECT count() FROM merge_tree; -- { serverError 158 } + +SET merge_tree_uniform_read_distribution = 0; +SELECT count() FROM merge_tree; -- { serverError 158 } + DROP TABLE merge_tree; From 27b9f79cba8ff1a8de428c0ca0d287f9aadd17c7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jul 2019 23:26:55 +0300 Subject: [PATCH 22/26] Fixed error after refactoring --- .../Storages/MergeTree/MergeTreeSelectBlockInputStream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreeSelectBlockInputStream.cpp b/dbms/src/Storages/MergeTree/MergeTreeSelectBlockInputStream.cpp index a6d2b32fb19..e1c5e5239b5 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeSelectBlockInputStream.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeSelectBlockInputStream.cpp @@ -46,13 +46,13 @@ MergeTreeSelectBlockInputStream::MergeTreeSelectBlockInputStream( for (const auto & range : all_mark_ranges) total_marks_count += range.end - range.begin; - size_t total_rows = data_part->index_granularity.getTotalRows(); + size_t total_rows = data_part->index_granularity.getRowsCountInRanges(all_mark_ranges); if (!quiet) LOG_TRACE(log, "Reading " << all_mark_ranges.size() << " ranges from part " << data_part->name << ", approx. " << total_rows << (all_mark_ranges.size() > 1 - ? ", up to " + toString(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) + ? 
", up to " + toString(total_rows) : "") << " rows starting from " << data_part->index_granularity.getMarkStartingRow(all_mark_ranges.front().begin)); From ea54cdbededbd304c7f0c08687b362efa4373527 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 16 Jul 2019 00:53:23 +0300 Subject: [PATCH 23/26] Update CompressionCodecDoubleDelta.cpp --- dbms/src/Compression/CompressionCodecDoubleDelta.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dbms/src/Compression/CompressionCodecDoubleDelta.cpp b/dbms/src/Compression/CompressionCodecDoubleDelta.cpp index 4e3f9eb9cd1..d3830dc9fdb 100644 --- a/dbms/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/dbms/src/Compression/CompressionCodecDoubleDelta.cpp @@ -38,11 +38,10 @@ Int64 getMaxValueForByteSize(UInt8 byte_size) return std::numeric_limits::max(); case sizeof(UInt64): return std::numeric_limits::max(); - default: - assert(false && "only 1,2,4 and 8 data sizes are supported"); + default: + assert(false && "only 1, 2, 4 and 8 data sizes are supported"); } - - return std::numeric_limits::max(); + __builtin_unreachable(); } struct WriteSpec From 4ed1592cb61794d596f5bc0c69b61aec43b6ab2c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 16 Jul 2019 01:00:06 +0300 Subject: [PATCH 24/26] Added license for test --- dbms/src/Common/tests/mi_malloc_test.cpp | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/dbms/src/Common/tests/mi_malloc_test.cpp b/dbms/src/Common/tests/mi_malloc_test.cpp index 6a565b71ae6..d9ee75fba6e 100644 --- a/dbms/src/Common/tests/mi_malloc_test.cpp +++ b/dbms/src/Common/tests/mi_malloc_test.cpp @@ -1,3 +1,29 @@ +/** In addition to ClickHouse (Apache 2) license, this file can be also used under MIT license: + +MIT License + +Copyright (c) 2019 Yandex LLC, Alexey Milovidov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + #include #include #include From bdff08a7aa922339498eae914c2b1bbb059ba8c2 Mon Sep 17 00:00:00 2001 From: Andy Yang Date: Tue, 16 Jul 2019 16:43:45 +0800 Subject: [PATCH 25/26] Fix a little problem in the document of hash_functions --- docs/en/query_language/functions/hash_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index 14577a6afe4..368cfa1622b 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -212,7 +212,7 @@ SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00: ## jumpConsistentHash Calculates JumpConsistentHash form a UInt64. -Accepts a UInt64-type argument. Returns Int32. +Accepts two arguments: a UInt64-type key and the number of buckets. Returns Int32. For more information, see the link: [JumpConsistentHash](https://arxiv.org/pdf/1406.2294.pdf) ## murmurHash2_32, murmurHash2_64 From 0357c118513b7d491f81756f4d82e8ac444be45e Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 16 Jul 2019 12:21:48 +0300 Subject: [PATCH 26/26] DOCAPI-7129 Nested JSON in JSONEachRow description + date_time_input_format (#5889) * The input_format_import_nested_json and date_time_input_format settings description. * Usage of Nested Structures with JSONEachRow. --- docs/en/interfaces/formats.md | 56 ++++++++- docs/en/operations/settings/settings.md | 40 +++++++ .../functions/array_functions.md | 107 ++++++++---------- docs/fa/interfaces/formats.md | 54 +++++++++ docs/zh/interfaces/formats.md | 54 +++++++++ 5 files changed, 251 insertions(+), 60 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 7d5add49bb4..71f28263270 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -323,7 +323,7 @@ When using this format, ClickHouse outputs rows as separated, newline-delimited ```json {"SearchPhrase":"curtain designs","count()":"1064"} {"SearchPhrase":"baku","count()":"1000"} -{"SearchPhrase":"","count":"8267016"} +{"SearchPhrase":"","count()":"8267016"} ``` When inserting the data, you should provide a separate JSON object for each row. @@ -386,6 +386,60 @@ Unlike the [JSON](#json) format, there is no substitution of invalid UTF-8 seque !!! note "Note" Any set of bytes can be output in the strings. Use the `JSONEachRow` format if you are sure that the data in the table can be formatted as JSON without losing any information. +### Usage of Nested Structures {#jsoneachrow-nested} + +If you have a table with the [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data having the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. + +For example, consider the following table: + +```sql +CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory +``` + +As you can find in the `Nested` data type description, ClickHouse treats each component of the nested structure as a separate column, `n.s` and `n.i` for our table. So you can insert the data the following way: + +```sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} +``` + +To insert data as hierarchical JSON object set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). 
+ +```json +{ + "n": { + "s": ["abc", "def"], + "i": [1, 23] + } +} +``` + +Without this setting ClickHouse throws the exception. + +```sql +SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' +``` +```text +┌─name────────────────────────────┬─value─┐ +│ input_format_import_nested_json │ 0 │ +└─────────────────────────────────┴───────┘ +``` +```sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +``` +```text +Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) +``` +```sql +SET input_format_import_nested_json=1 +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +SELECT * FROM json_each_row_nested +``` +```text +┌─n.s───────────┬─n.i────┐ +│ ['abc','def'] │ [1,23] │ +└───────────────┴────────┘ +``` + ## Native {#native} The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is "columnar" – it doesn't convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 5d299e72bdf..c68643d3877 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -231,6 +231,25 @@ Possible values: Default value: 0. +## input_format_import_nested_json {#settings-input_format_import_nested_json} + +Enables or disables inserting of JSON data with nested objects. + +Supported formats: + +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +**See Also** + +- [Usage of Nested Structures](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format. + ## input_format_with_names_use_header {#settings-input_format_with_names_use_header} Enables or disables checking the column order when inserting data. @@ -249,6 +268,27 @@ Possible values: Default value: 1. +## date_time_input_format {#settings-date_time_input_format} + +Enables or disables extended parsing of date and time formatted strings. + +The setting doesn't apply to [date and time functions](../../query_language/functions/date_time_functions.md). + +Possible values: + +- `'best_effort'` — Enables extended parsing. + + ClickHouse can parse the basic format `YYYY-MM-DD HH:MM:SS` and all the [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) date and time formats. For example, `'2018-06-08T01:02:03.000Z'`. + +- `'basic'` — Use basic parser. + + ClickHouse can parse only the basic format. + +**See Also** + +- [DateTime data type.](../../data_types/datetime.md) +- [Functions for working with dates and times.](../../query_language/functions/date_time_functions.md) + ## join_default_strictness {#settings-join_default_strictness} Sets default strictness for [JOIN clauses](../../query_language/select.md#select-join). 
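
For example, a minimal sketch of the effect (the inline subqueries are illustrative only, not part of the patch):

```sql
SET join_default_strictness = 'ALL';

-- With no explicit ANY/ALL keyword, the JOIN below now runs as ALL JOIN
-- and keeps every matching row from the right-hand side:
SELECT a.k, b.v
FROM (SELECT 1 AS k) AS a
JOIN (SELECT 1 AS k, 'x' AS v UNION ALL SELECT 1 AS k, 'y' AS v) AS b
USING k;
```

With an empty value for the setting, the server instead requires the strictness to be spelled out explicitly in the query.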
diff --git a/docs/en/query_language/functions/array_functions.md b/docs/en/query_language/functions/array_functions.md index 938a10ee8d0..94d79a1898b 100644 --- a/docs/en/query_language/functions/array_functions.md +++ b/docs/en/query_language/functions/array_functions.md @@ -58,11 +58,10 @@ arrayConcat(arrays) - `arrays` – Arbitrary number of arguments of [Array](../../data_types/array.md) type. **Example** -``` sql +```sql SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res ``` - -``` +```text ┌─res───────────┐ │ [1,2,3,4,5,6] │ └───────────────┘ @@ -204,7 +203,7 @@ Returns the array \[1, 2, 3, ..., length (arr) \] This function is normally used with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example: -``` sql +```sql SELECT count() AS Reaches, countIf(num = 1) AS Hits @@ -215,8 +214,7 @@ ARRAY JOIN WHERE CounterID = 160656 LIMIT 10 ``` - -``` +```text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -224,15 +222,14 @@ LIMIT 10 In this example, Reaches is the number of conversions (the strings received after applying ARRAY JOIN), and Hits is the number of pageviews (strings before ARRAY JOIN). In this particular case, you can get the same result in an easier way: -``` sql +```sql SELECT sum(length(GoalsReached)) AS Reaches, count() AS Hits FROM test.hits WHERE (CounterID = 160656) AND notEmpty(GoalsReached) ``` - -``` +```text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -248,7 +245,7 @@ For example: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. This function is useful when using ARRAY JOIN and aggregation of array elements. Example: -``` sql +```sql SELECT Goals.ID AS GoalID, sum(Sign) AS Reaches, @@ -262,8 +259,7 @@ GROUP BY GoalID ORDER BY Reaches DESC LIMIT 10 ``` - -``` +```text ┌──GoalID─┬─Reaches─┬─Visits─┐ │ 53225 │ 3214 │ 1097 │ │ 2825062 │ 3188 │ 1097 │ @@ -282,11 +278,10 @@ In this example, each goal ID has a calculation of the number of conversions (ea The arrayEnumerateUniq function can take multiple arrays of the same size as arguments. In this case, uniqueness is considered for tuples of elements in the same positions in all the arrays. -``` sql +```sql SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res ``` - -``` +```text ┌─res───────────┐ │ [1,2,1,1,2,1] │ └───────────────┘ @@ -308,11 +303,10 @@ arrayPopBack(array) **Example** -``` sql +```sql SELECT arrayPopBack([1, 2, 3]) AS res ``` - -``` +```text ┌─res───┐ │ [1,2] │ └───────┘ @@ -332,11 +326,10 @@ arrayPopFront(array) **Example** -``` sql +```sql SELECT arrayPopFront([1, 2, 3]) AS res ``` - -``` +```text ┌─res───┐ │ [2,3] │ └───────┘ @@ -357,11 +350,10 @@ arrayPushBack(array, single_value) **Example** -``` sql +```sql SELECT arrayPushBack(['a'], 'b') AS res ``` - -``` +```text ┌─res───────┐ │ ['a','b'] │ └───────────┘ @@ -382,11 +374,10 @@ arrayPushFront(array, single_value) **Example** -``` sql +```sql SELECT arrayPushBack(['b'], 'a') AS res ``` - -``` +```text ┌─res───────┐ │ ['a','b'] │ └───────────┘ @@ -446,11 +437,10 @@ arraySlice(array, offset[, length]) **Example** -``` sql +```sql SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res ``` - -``` +```text ┌─res────────┐ │ [2,NULL,4] │ └────────────┘ @@ -464,10 +454,10 @@ Sorts the elements of the `arr` array in ascending order. 
If the `func` function Example of integer values sorting: -``` sql +```sql SELECT arraySort([1, 3, 3, 0]); ``` -``` +```text ┌─arraySort([1, 3, 3, 0])─┐ │ [0,1,3,3] │ └─────────────────────────┘ @@ -475,10 +465,10 @@ SELECT arraySort([1, 3, 3, 0]); Example of string values sorting: -``` sql +```sql SELECT arraySort(['hello', 'world', '!']); ``` -``` +```text ┌─arraySort(['hello', 'world', '!'])─┐ │ ['!','hello','world'] │ └────────────────────────────────────┘ @@ -486,10 +476,10 @@ SELECT arraySort(['hello', 'world', '!']); Consider the following sorting order for the `NULL`, `NaN` and `Inf` values: -``` sql +```sql SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); ``` -``` +```text ┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐ │ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────────────────────────┘ @@ -504,10 +494,10 @@ Note that `arraySort` is a [higher-order function](higher_order_functions.md). Y Let's consider the following example: -``` sql +```sql SELECT arraySort((x) -> -x, [1, 2, 3]) as res; ``` -``` +```text ┌─res─────┐ │ [3,2,1] │ └─────────┘ @@ -517,11 +507,10 @@ For each element of the source array, the lambda function returns the sorting ke The lambda function can accept multiple arguments. In this case, you need to pass the `arraySort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example: -``` sql +```sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` - -``` +```text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ @@ -531,19 +520,19 @@ Here, the elements that are passed in the second array ([2, 1]) define a sorting Other examples are shown below. -``` sql +```sql SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` -``` sql +```text ┌─res─────┐ │ [2,1,0] │ └─────────┘ ``` -``` sql +```sql SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; ``` -``` sql +```text ┌─res─────┐ │ [2,1,0] │ └─────────┘ @@ -558,10 +547,10 @@ Sorts the elements of the `arr` array in descending order. If the `func` functio Example of integer values sorting: -``` sql +```sql SELECT arrayReverseSort([1, 3, 3, 0]); ``` -``` +```text ┌─arrayReverseSort([1, 3, 3, 0])─┐ │ [3,3,1,0] │ └────────────────────────────────┘ @@ -569,10 +558,10 @@ SELECT arrayReverseSort([1, 3, 3, 0]); Example of string values sorting: -``` sql +```sql SELECT arrayReverseSort(['hello', 'world', '!']); ``` -``` +```text ┌─arrayReverseSort(['hello', 'world', '!'])─┐ │ ['world','hello','!'] │ └───────────────────────────────────────────┘ @@ -580,10 +569,10 @@ SELECT arrayReverseSort(['hello', 'world', '!']); Consider the following sorting order for the `NULL`, `NaN` and `Inf` values: -``` sql +```sql SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; ``` -``` sql +```text ┌─res───────────────────────────────────┐ │ [inf,3,2,1,-4,-inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────┘ @@ -596,10 +585,10 @@ SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; Note that the `arrayReverseSort` is a [higher-order function](higher_order_functions.md). You can pass a lambda function to it as the first argument. Example is shown below. 
-``` sql +```sql SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; ``` -``` +```text ┌─res─────┐ │ [1,2,3] │ └─────────┘ @@ -612,10 +601,10 @@ The array is sorted in the following way: The lambda function can accept multiple arguments. In this case, you need to pass the `arrayReverseSort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example: -``` sql +```sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -``` sql +```text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ @@ -628,18 +617,18 @@ In this example, the array is sorted in the following way: Other examples are shown below. -``` sql +```sql SELECT arrayReverseSort((x, y) -> y, [4, 3, 5], ['a', 'b', 'c']) AS res; ``` -``` sql +```text ┌─res─────┐ │ [5,3,4] │ └─────────┘ ``` -``` sql +```sql SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; ``` -``` sql +```text ┌─res─────┐ │ [4,3,5] │ └─────────┘ diff --git a/docs/fa/interfaces/formats.md b/docs/fa/interfaces/formats.md index 49286b0bdd9..a8c91c73b8a 100644 --- a/docs/fa/interfaces/formats.md +++ b/docs/fa/interfaces/formats.md @@ -328,6 +328,60 @@ JSON با جاوااسکریپت سازگار است. برای اطمینان ا برای پارس کردن، هر ترتیبی برای مقادیر ستون های مختلف پشتیبانی می شود. حذف شدن بعضی مقادیر قابل قبول است، آنها با مقادیر پیش فرض خود برابر هستند. در این مورد، صفر و سطر های خالی به عنوان مقادیر پیش فرض قرار می گیرند. مقادیر پیچیده که می توانند در جدول مشخص شوند، به عنوان مقادیر پیش فرض پشتیبانی نمی شوند. Whitespace بین element ها نادیده گرفته می شوند. اگر کاما بعد از object ها قرار گیرند، نادیده گرفته می شوند. object ها نیازی به جداسازی با استفاده از new line را ندارند. +### Usage of Nested Structures {#jsoneachrow-nested} + +If you have a table with the [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data having the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. + +For example, consider the following table: + +```sql +CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory +``` + +As you can find in the `Nested` data type description, ClickHouse treats each component of the nested structure as a separate column, `n.s` and `n.i` for our table. So you can insert the data the following way: + +```sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} +``` + +To insert data as hierarchical JSON object set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). + +```json +{ + "n": { + "s": ["abc", "def"], + "i": [1, 23] + } +} +``` + +Without this setting ClickHouse throws the exception. + +```sql +SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' +``` +```text +┌─name────────────────────────────┬─value─┐ +│ input_format_import_nested_json │ 0 │ +└─────────────────────────────────┴───────┘ +``` +```sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +``` +```text +Code: 117. 
DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) +``` +```sql +SET input_format_import_nested_json=1 +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +SELECT * FROM json_each_row_nested +``` +```text +┌─n.s───────────┬─n.i────┐ +│ ['abc','def'] │ [1,23] │ +└───────────────┴────────┘ +``` + ## Native کارآمدترین فرمت. داده ها توسط بلاک ها و در فرمت باینری نوشته و خوانده می شوند. برای هر بلاک، تعداد سطرها، تعداد ستون ها، نام ستون ها و type آنها، و بخش هایی از ستون ها در این بلاک یکی پس از دیگری ثبت می شوند. به عبارت دیگر، این فرمت "columnar" است - این فرمت ستون ها را به سطر تبدیل نمی کند. این فرمت در حالت native interface و بین سرور و محیط ترمینال و همچنین کلاینت C++ استفاده می شود. diff --git a/docs/zh/interfaces/formats.md b/docs/zh/interfaces/formats.md index d9ce4f3c1da..65358115295 100644 --- a/docs/zh/interfaces/formats.md +++ b/docs/zh/interfaces/formats.md @@ -327,6 +327,60 @@ ClickHouse 支持 [NULL](../query_language/syntax.md), 在 JSON 格式中以 `nu 对于解析,任何顺序都支持不同列的值。可以省略某些值 - 它们被视为等于它们的默认值。在这种情况下,零和空行被用作默认值。 作为默认值,不支持表中指定的复杂值。元素之间的空白字符被忽略。如果在对象之后放置逗号,它将被忽略。对象不一定必须用新行分隔。 +### Usage of Nested Structures {#jsoneachrow-nested} + +If you have a table with the [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data having the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. + +For example, consider the following table: + +```sql +CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory +``` + +As you can find in the `Nested` data type description, ClickHouse treats each component of the nested structure as a separate column, `n.s` and `n.i` for our table. So you can insert the data the following way: + +```sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} +``` + +To insert data as hierarchical JSON object set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). + +```json +{ + "n": { + "s": ["abc", "def"], + "i": [1, 23] + } +} +``` + +Without this setting ClickHouse throws the exception. + +```sql +SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' +``` +```text +┌─name────────────────────────────┬─value─┐ +│ input_format_import_nested_json │ 0 │ +└─────────────────────────────────┴───────┘ +``` +```sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +``` +```text +Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) +``` +```sql +SET input_format_import_nested_json=1 +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +SELECT * FROM json_each_row_nested +``` +```text +┌─n.s───────────┬─n.i────┐ +│ ['abc','def'] │ [1,23] │ +└───────────────┴────────┘ +``` + ## Native {#native} 最高性能的格式。 据通过二进制格式的块进行写入和读取。对于每个块,该块中的行数,列数,列名称和类型以及列的部分将被相继记录。 换句话说,这种格式是 “列式”的 - 它不会将列转换为行。 这是用于在服务器之间进行交互的本地界面中使用的格式,用于使用命令行客户端和 C++ 客户端。
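
As a closing illustration of the Native round trip described above, a minimal hedged sketch (table and file names are examples only, not from the patch):

```sql
-- Dump a table in the columnar Native format; blocks are written as-is,
-- so the file can be streamed straight back into an INSERT:
SELECT * FROM json_each_row_nested INTO OUTFILE 'dump.native' FORMAT Native;

-- Reloading, e.g. from the shell:
--   clickhouse-client --query="INSERT INTO json_each_row_nested FORMAT Native" < dump.native
```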